Load and save checks from a Delta table #339

Merged · 24 commits · May 27, 2025

34 changes: 31 additions & 3 deletions demos/dqx_demo_library.py
@@ -62,12 +62,15 @@
user_name = spark.sql("select current_user() as user").collect()[0]["user"]
checks_file = f"/Workspace/Users/{user_name}/dqx_demo_checks.yml"
dq_engine = DQEngine(ws)
dq_engine.save_checks_in_workspace_file(checks, workspace_path=checks_file)
dq_engine.save_checks_in_workspace_file(checks=checks, workspace_path=checks_file)

# save generated checks in a Delta table
dq_engine.save_checks_in_table(checks=checks, table_name="main.default.dqx_checks_table", mode="overwrite")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Loading and applying quality checks
# MAGIC ## Loading and applying quality checks from a file

# COMMAND ----------

@@ -76,7 +79,7 @@

input_df = spark.createDataFrame([[1, 3, 3, 2], [3, 3, None, 1]], schema)

# load checks
# load checks from a file
dq_engine = DQEngine(WorkspaceClient())
checks = dq_engine.load_checks_from_workspace_file(workspace_path=checks_file)

@@ -91,6 +94,31 @@

# COMMAND ----------

# MAGIC %md
# MAGIC ## Loading and applying quality checks from a Delta table

# COMMAND ----------

from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

input_df = spark.createDataFrame([[1, 3, 3, 2], [3, 3, None, 1]], schema)

# load checks from a Delta table
dq_engine = DQEngine(WorkspaceClient())
checks = dq_engine.load_checks_from_table(table_name="main.default.dqx_checks_table")

# Option 1: apply quality rules and quarantine invalid records
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)
display(valid_df)
display(quarantined_df)

# Option 2: apply quality rules and flag invalid records as additional columns (`_warning` and `_error`)
valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks)
display(valid_and_quarantined_df)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Validating syntax of quality checks defined in yaml

62 changes: 57 additions & 5 deletions docs/dqx/docs/guide.mdx
@@ -160,17 +160,41 @@ Fields:
- `criticality`: either "error" (data going only into "bad/quarantine" dataframe) or "warn" (data going into both "good" and "bad" dataframes). If not provided, the default is "error".
- `check`: column expression containing "function" (check function to apply), "arguments" (check function arguments), and "col_name" (column name as `str` or sql expression the check will be applied for) or "col_names" (column names as `array` the check will be applied for).
- (optional) `name` for the check: autogenerated if not provided.
- (optional) `filter` to filter the rows for which the check is applied (e.g. `"business_unit = 'Finance'"`)
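
For illustration, a minimal check using these fields might look as follows (a rough sketch; `city` and `country` are placeholder column names mirroring the table example below):

```python
import yaml

# One check defined as metadata: warn when `city` is null, but only for Polish records
checks = yaml.safe_load("""
- criticality: warn
  filter: "country = 'Poland'"
  check:
    function: is_not_null
    arguments:
      col_name: city
""")
```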

### Quality rules configured in a Delta table

Quality rules can also be stored in a Delta table in Unity Catalog. Each row represents a check with column values for the `name`, `check`, `criticality`, `filter`, and `run_config_name`.

```python
# The checks table will contain the following columns:
# +------------------+-----------------------+---------------------------------------+----------------------+--------------------+
# | name | criticality | check | filter | run_config_name |
# +------------------+-----------------------+---------------------------------------+----------------------+--------------------+
# | "city_is_null" | "warn" | {function: 'is_not_null', | "country = 'Poland'" | "default" |
# | | | arguments: {'col_name': 'city'}} | | |
# | ... | ... | ... | ... | ... |
# +------------------+-----------------------+---------------------------------------+----------------------+--------------------+
```
Fields:
- `criticality`: either "error" (data going only into "bad/quarantine" dataframe) or "warn" (data going into both "good" and "bad" dataframes). If not provided, the default is "error".
- `check`: a `StructType` value with the following fields:
- `function`: Name of the DQX check function to apply
- `arguments`: A `MapType` value with the function's keyword arguments as key-value pairs
- (optional) `name`: Name to use for the check
- (optional) `filter`: Spark expression to filter the rows for which the check is applied (e.g. `"business_unit = 'Finance'"`)
- `run_config_name`: A run or workflow name. Can be used to load and apply a subset of checks to specific workflows. Default value is `"default"`.
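
The table does not need to be populated by hand. As a rough sketch, checks defined as metadata can be written to it with `save_checks_in_table` (the table name reuses the one from the demo notebook; the check and column names are illustrative):

```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())

# Checks expressed as metadata; `name` and `filter` are optional
checks = [
    {
        "name": "city_is_null",
        "criticality": "warn",
        "filter": "country = 'Poland'",
        "check": {"function": "is_not_null", "arguments": {"col_name": "city"}},
    },
]

# Writes one row per check; rows are tagged with run_config_name="default" unless overridden
dq_engine.save_checks_in_table(
    checks=checks,
    table_name="main.default.dqx_checks_table",
    run_config_name="default",
    mode="overwrite",
)
```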

### Loading and execution methods

Checks can be loaded from a file in the installation folder, workspace, or local file system. The engine will raise an error if the checks file contains invalid JSON or YAML definition.
Checks can be loaded from a Delta table in Unity Catalog or from a file in the installation folder, workspace, or local file system. The engine will raise an error if the stored checks (either from a file or table) contain an invalid definition.

Checks loaded from a file can be applied using one of the following methods:
Checks loaded from a file or table can be applied using one of the following methods:
* `apply_checks_by_metadata_and_split`: splits the input data into valid and invalid (quarantined) dataframes.
* `apply_checks_by_metadata`: reports issues as additional columns.

Syntax of the loaded checks is validated automatically as part of these methods.
In addition, you can also perform a standalone syntax validation of the checks as described [here](#validating-syntax-of-quality-checks-defined-in-yamljson).
In addition, you can also perform a standalone syntax validation of the checks as described [here](#validating-syntax-of-quality-checks-defined-in-configuration).

#### Method 1: Loading checks from a workspace file in the installation folder

@@ -239,6 +263,34 @@ In addition, you can also perform a standalone syntax validation of the checks a
</TabItem>
</Tabs>

#### Method 4: Loading checks from a Delta table

<Tabs>
<TabItem value="Python" label="Python" default>
```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())

# Load all checks with the "default" `run_config_name`:
default_checks = dq_engine.load_checks_from_table("dq.config.checks_table")

# Load checks with the "workflow_001" `run_config_name`:
workflow_checks = dq_engine.load_checks_from_table("dq.config.checks_table", "workflow_001")

checks = default_checks + workflow_checks
input_df = spark.read.table("catalog1.schema1.table1")

# Option 1: apply quality rules on the dataframe and provide valid and invalid (quarantined) dataframes
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(input_df, checks)

# Option 2: apply quality rules on the dataframe and report issues as additional columns (`_warning` and `_error`)
valid_and_quarantined_df = dq_engine.apply_checks_by_metadata(input_df, checks)
```
</TabItem>
</Tabs>

### Quality rules defined in code

#### Method 1: Using DQX classes
@@ -427,9 +479,9 @@ The DQX integration with DLT does not use DLT Expectations but DQX's own methods
</TabItem>
</Tabs>

## Validating syntax of quality checks defined in yaml/json
## Validating syntax of quality checks defined in configuration

You can validate the syntax of checks defined as metadata in `yaml` or `json` format before applying them. This validation ensures that the checks are correctly defined and can be interpreted by the DQX engine.
You can validate the syntax of checks defined as metadata in a Delta table or file (either `yaml` or `json`) before applying them. This validation ensures that the checks are correctly defined and can be interpreted by the DQX engine.
The validation cannot be used for checks defined using [DQX classes](#method-1-using-dqx-classes). When checks are defined with DQX classes, syntax validation is unnecessary because the application will fail to interpret them if the DQX objects are constructed incorrectly.
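
As a rough sketch, checks loaded from a Delta table can be validated the same way as file-based checks, assuming `validate_checks` returns a status object exposing `has_errors` and `errors` as in the examples below:

```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

dq_engine = DQEngine(WorkspaceClient())

# Load checks stored in a Delta table (table name reused from the loading example above)
checks = dq_engine.load_checks_from_table("dq.config.checks_table")

# Standalone syntax validation before the checks are applied
status = dq_engine.validate_checks(checks)
if status.has_errors:
    raise ValueError(f"Invalid checks: {status.errors}")
```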

<Tabs>
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -413,7 +413,7 @@ max-args = 10
max-positional-arguments=10

# Maximum number of attributes for a class (see R0902).
max-attributes = 15
max-attributes = 16

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr = 5
1 change: 1 addition & 0 deletions src/databricks/labs/dqx/config.py
@@ -17,6 +17,7 @@ class RunConfig:
output_table: str | None = None # output data table
quarantine_table: str | None = None # quarantined data table
checks_file: str | None = "checks.yml" # file containing quality rules / checks
checks_table: str | None = None # table containing quality rules / checks
profile_summary_stats_file: str | None = "profile_summary_stats.yml" # file containing profile summary statistics
override_clusters: dict[str, str] | None = None # cluster configuration for jobs
spark_conf: dict[str, str] | None = None # extra spark configs
113 changes: 112 additions & 1 deletion src/databricks/labs/dqx/engine.py
@@ -3,12 +3,13 @@
import functools as ft
import inspect
import itertools
import warnings
from pathlib import Path
from collections.abc import Callable
from typing import Any
import yaml
import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql import DataFrame, SparkSession

from databricks.labs.blueprint.installation import Installation
from databricks.labs.dqx import row_checks
@@ -30,6 +31,7 @@
from databricks.sdk import WorkspaceClient

logger = logging.getLogger(__name__)
COLLECT_LIMIT_WARNING = 500


class DQEngineCore(DQEngineCoreBase):
Expand Down Expand Up @@ -142,6 +144,72 @@ def save_checks_in_local_file(checks: list[dict], filepath: str):
msg = f"Checks file {filepath} missing"
raise FileNotFoundError(msg) from None

@staticmethod
def build_quality_rules_from_dataframe(df: DataFrame, run_config_name: str = "default") -> list[dict]:
"""Build checks from a Spark DataFrame based on check specifications, i.e. function name plus arguments.

:param df: Spark DataFrame with data quality check rules. Each row should define a check. Rows should
have the following columns:
* `name` - Name that will be given to a resulting column. Autogenerated if not provided
* `criticality` (optional) - Possible values are `error` (data going only into "bad" dataframe) and `warn` (data is going into both dataframes)
* `check` - A `StructType` column defining the DQX check function and its arguments
* `filter` - Expression for filtering data quality checks
* `run_config_name` (optional) - Run configuration name for storing checks across runs
:param run_config_name: Run configuration name for filtering quality rules
:return: List of data quality check specifications as a Python dictionary
"""
check_rows = df.where(f"run_config_name = '{run_config_name}'").collect()
if len(check_rows) > COLLECT_LIMIT_WARNING:
warnings.warn(
f"Collecting large number of rows from Spark DataFrame: {len(check_rows)}",
category=UserWarning,
stacklevel=2,
)
checks = []
for row in check_rows:
check = {"name": row.name, "criticality": row.criticality, "check": row.check.asDict()}
if row.filter is not None:
check["filter"] = row.filter
checks.append(check)
return checks

@staticmethod
def build_dataframe_from_quality_rules(
checks: list[dict], run_config_name: str = "default", spark: SparkSession | None = None
) -> DataFrame:
"""Build a Spark DataFrame from a set of check specifications, i.e. function name plus arguments.

:param checks: list of check specifications as Python dictionaries. Each check consists of the following fields:
* `check` - Column expression to evaluate. The expression should return a string value (used as the error/warning message) if it evaluates to true, or `null` if it evaluates to false
* `name` - Name that will be given to a resulting column. Autogenerated if not provided
* `criticality` (optional) - Possible values are `error` (data going only into "bad" dataframe) and `warn` (data is going into both dataframes)
* `filter` (optional) - Expression for filtering data quality checks
:param run_config_name: Run configuration name for storing quality checks across runs
:param spark: SparkSession to use for DataFrame operations
:return: Spark DataFrame with data quality check rules
"""
if spark is None:
spark = SparkSession.builder.getOrCreate()
schema = "name STRING, criticality STRING, check STRUCT<function STRING, arguments MAP<STRING, STRING>>, filter STRING, run_config_name STRING"
dq_rule_checks = DQEngineCore.build_checks_by_metadata(checks)
dq_rule_rows = []
for dq_rule_check in dq_rule_checks:
arguments = dq_rule_check.check_func_kwargs
if isinstance(dq_rule_check, DQColSetRule):
arguments["col_names"] = dq_rule_check.columns
if isinstance(dq_rule_check, DQColRule):
arguments["col_name"] = dq_rule_check.col_name
dq_rule_rows.append(
[
dq_rule_check.name,
dq_rule_check.criticality,
{"function": dq_rule_check.check_func.__name__, "arguments": arguments},
dq_rule_check.filter,
run_config_name,
]
)
return spark.createDataFrame(dq_rule_rows, schema)

@staticmethod
def build_checks_by_metadata(checks: list[dict], custom_checks: dict[str, Any] | None = None) -> list[DQColRule]:
"""Build checks based on check specification, i.e. function name plus arguments.
@@ -590,6 +658,23 @@ def load_checks_from_installation(
raise ValueError(f"Invalid or no checks in workspace file: {installation.install_folder()}/{filename}")
return parsed_checks

def load_checks_from_table(
self, table_name: str, run_config_name: str = "default", spark: SparkSession | None = None
) -> list[dict]:
"""
Load checks (dq rules) from a Delta table in the workspace.
:param table_name: Unity catalog or Hive metastore table name
:param run_config_name: Run configuration name for filtering checks
:param spark: Optional SparkSession
:return: List of dq rules or raise an error if the checks table is missing or contains invalid checks.
"""
logger.info(f"Loading quality rules (checks) from table {table_name}")
if not self.ws.tables.exists(table_name).table_exists:
raise NotFound(f"Table {table_name} does not exist in the workspace")
if spark is None:
spark = SparkSession.builder.getOrCreate()
return DQEngine._load_checks_from_table(table_name, run_config_name, spark)

@staticmethod
def save_checks_in_local_file(checks: list[dict], path: str):
return DQEngineCore.save_checks_in_local_file(checks, path)
@@ -633,6 +718,20 @@ def save_checks_in_workspace_file(self, checks: list[dict], workspace_path: str)
workspace_path, yaml.safe_dump(checks).encode('utf-8'), format=ImportFormat.AUTO, overwrite=True
)

@staticmethod
def save_checks_in_table(
checks: list[dict], table_name: str, run_config_name: str = "default", mode: str = "append"
):
"""
Save checks to a Delta table in the workspace.
:param checks: list of dq rules to save
:param table_name: Unity catalog or Hive metastore table name
:param run_config_name: Run configuration name for identifying groups of checks
:param mode: Output mode for writing checks to Delta (e.g. `append` or `overwrite`)
"""
logger.info(f"Saving quality rules (checks) to table {table_name}")
DQEngine._save_checks_in_table(checks, table_name, run_config_name, mode)

def load_run_config(
self, run_config_name: str | None = "default", assume_user: bool = True, product_name: str = "dqx"
) -> RunConfig:
@@ -670,3 +769,15 @@ def _load_checks_from_file(installation: Installation, filename: str) -> list[di
except NotFound:
msg = f"Checks file {filename} missing"
raise NotFound(msg) from None

@staticmethod
def _load_checks_from_table(table_name: str, run_config_name: str, spark: SparkSession | None = None) -> list[dict]:
if spark is None:
spark = SparkSession.builder.getOrCreate()
rules_df = spark.read.table(table_name)
return DQEngineCore.build_quality_rules_from_dataframe(rules_df, run_config_name=run_config_name)

@staticmethod
def _save_checks_in_table(checks: list[dict], table_name: str, run_config_name: str, mode: str):
rules_df = DQEngineCore.build_dataframe_from_quality_rules(checks, run_config_name=run_config_name)
rules_df.write.saveAsTable(table_name, mode=mode)
19 changes: 3 additions & 16 deletions tests/integration/conftest.py
@@ -80,12 +80,7 @@ def config(self) -> WorkspaceConfig:
class MockInstallationContext(MockRuntimeContext):
__test__ = False

def __init__(
self,
env_or_skip_fixture,
ws,
check_file,
):
def __init__(self, env_or_skip_fixture, ws, check_file):
super().__init__(env_or_skip_fixture, ws)
self.check_file = check_file

@@ -170,16 +165,8 @@ def workspace_installation(self) -> WorkspaceInstallation:


@pytest.fixture
def installation_ctx(
ws,
env_or_skip,
check_file="checks.yml",
) -> Generator[MockInstallationContext, None, None]:
ctx = MockInstallationContext(
env_or_skip,
ws,
check_file,
)
def installation_ctx(ws, env_or_skip, check_file="checks.yml") -> Generator[MockInstallationContext, None, None]:
ctx = MockInstallationContext(env_or_skip, ws, check_file)
yield ctx.replace(workspace_client=ws)
ctx.workspace_installation.uninstall()
