Feat add json validation checks #616
base: main
Changes from 28 commits
@@ -38,6 +38,8 @@ You can also define your own custom checks (see [Creating custom checks](#creati

| `is_not_greater_than` | Checks whether the values in the input column are not greater than the provided limit. | `column`: column to check (can be a string column name or a column expression); `limit`: limit as number, date, timestamp, column name or sql expression |
| `is_valid_date` | Checks whether the values in the input column have valid date formats. | `column`: column to check (can be a string column name or a column expression); `date_format`: optional date format (e.g. 'yyyy-mm-dd') |
| `is_valid_timestamp` | Checks whether the values in the input column have valid timestamp formats. | `column`: column to check (can be a string column name or a column expression); `timestamp_format`: optional timestamp format (e.g. 'yyyy-mm-dd HH:mm:ss') |
| `is_valid_json` | Checks whether the values in the input column are valid JSON objects. | `column`: column to check (can be a string column name or a column expression) |
| `has_json_keys` | Checks whether the values in the input column contain specific JSON keys. | `column`: column to check (can be a string column name or a column expression); `keys`: list of JSON keys to check for |
| `is_not_in_future` | Checks whether the values in the input column contain a timestamp that is not in the future, where 'future' is defined as current_timestamp + offset (in seconds). | `column`: column to check (can be a string column name or a column expression); `offset`: offset to use; `curr_timestamp`: current timestamp, if not provided current_timestamp() function is used |
| `is_not_in_near_future` | Checks whether the values in the input column contain a timestamp that is not in the near future, where 'near future' is defined as greater than the current timestamp but less than the current_timestamp + offset (in seconds). | `column`: column to check (can be a string column name or a column expression); `offset`: offset to use; `curr_timestamp`: current timestamp, if not provided current_timestamp() function is used |
| `is_older_than_n_days` | Checks whether the values in one input column are at least N days older than the values in another column. | `column`: column to check (can be a string column name or a column expression); `days`: number of days; `curr_date`: current date, if not provided current_date() function is used; `negate`: if the condition should be negated |
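For orientation, here is a minimal end-to-end sketch of applying the two new checks. It is not taken from this PR: the module paths follow the imports used elsewhere in these docs, `check_funcs` is assumed to expose the new functions, and the table name is hypothetical.

```python
from databricks.labs.dqx import check_funcs
from databricks.labs.dqx.engine import DQEngine
from databricks.labs.dqx.rule import DQRowRule
from databricks.sdk import WorkspaceClient

checks = [
    # Fail rows whose col_json_str does not parse as JSON
    DQRowRule(criticality="error", check_func=check_funcs.is_valid_json, column="col_json_str"),
    # Fail rows that contain neither key1 nor key2 (require_all=False means any listed key suffices)
    DQRowRule(
        criticality="error",
        check_func=check_funcs.has_json_keys,
        column="col_json_str",
        check_func_kwargs={"keys": ["key1", "key2"], "require_all": False},
    ),
]

dq_engine = DQEngine(WorkspaceClient())
df = spark.read.table("main.default.table")  # hypothetical table
valid_df, quarantine_df = dq_engine.apply_checks_and_split(df, checks)
```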
@@ -322,6 +324,33 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen

```yaml
      column: col5
      date_format: yyyy-MM-dd

# is_valid_json check
- criticality: error
  check:
    function: is_valid_json
    arguments:
      column: col_json_str

# has_json_keys check
- criticality: error
  check:
    function: has_json_keys
    arguments:
      column: col_json_str
      keys:
        - key1

- criticality: error
  name: col_json_str_has_no_json_keys2
  check:
    function: has_json_keys
    arguments:
      column: col_json_str
      keys:
        - key1
        - key2
      require_all: False

# is_valid_timestamp check
- criticality: error
  check:
```
@@ -531,42 +560,42 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen

```yaml
    function: is_linestring
    arguments:
      column: linestring_geom

# is_polygon check
- criticality: error
  check:
    function: is_polygon
    arguments:
      column: polygon_geom

# is_multipoint check
- criticality: error
  check:
    function: is_multipoint
    arguments:
      column: multipoint_geom

# is_multilinestring check
- criticality: error
  check:
    function: is_multilinestring
    arguments:
      column: multilinestring_geom

# is_multipolygon check
- criticality: error
  check:
    function: is_multipolygon
    arguments:
      column: multipolygon_geom

# is_geometrycollection check
- criticality: error
  check:
    function: is_geometrycollection
    arguments:
      column: geometrycollection_geom

# is_ogc_valid check
- criticality: error
  check:
```

@@ -580,15 +609,15 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen

```yaml
    function: is_non_empty_geometry
    arguments:
      column: point_geom

# has_dimension check
- criticality: error
  check:
    function: has_dimension
    arguments:
      column: polygon_geom
      dimension: 2

# has_x_coordinate_between check
- criticality: error
  check:
```

@@ -597,7 +626,7 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen

```yaml
      column: polygon_geom
      min_value: 0.0
      max_value: 10.0

# has_y_coordinate_between check
- criticality: error
  check:
```

@@ -606,6 +635,7 @@ For brevity, the `name` field in the examples is omitted and it will be auto-gen

```yaml
      column: polygon_geom
      min_value: 0.0
      max_value: 10.0
```
</details>
@@ -863,6 +893,22 @@ checks = [

```python
        name="col5_is_not_valid_date2"
    ),

    DQRowRule(
        criticality="error",
        check_func=check_funcs.has_json_keys,
        column="col_json_str",  # or as expr: F.col("col_json_str")
        check_func_kwargs={"keys": ["key1"]},
        name="col_json_str_has_json_keys"
    ),

    DQRowRule(
        criticality="error",
        check_func=check_funcs.has_json_keys,
        column="col_json_str",  # or as expr: F.col("col_json_str")
        check_func_kwargs={"keys": ["key1", "key2"], "require_all": False},
        name="col_json_str_has_json_keys2"
    ),

    # is_valid_timestamp check
    DQRowRule(
        criticality="error",
```
@@ -878,6 +924,13 @@ checks = [

```python
        name="col6_is_not_valid_timestamp2"
    ),

    # is_valid_json check
    DQRowRule(
        criticality="error",
        check_func=check_funcs.is_valid_json,
        column="col_json_str"
    ),

    # is_not_in_future check
    DQRowRule(
        criticality="error",
```

**Review comment** (on the `# is_valid_json check` example): pls group examples for …
@@ -1015,7 +1068,7 @@ checks = [

```python
        check_func=geo_check_funcs.is_multilinestring,
        column="multilinestring_geom"
    ),

    # is_multipolygon check
    DQRowRule(
        criticality="error",
```
@@ -3021,7 +3074,7 @@ The PII detection extras include a built-in `does_not_contain_pii` check that us

```yaml
    function: does_not_contain_pii
    arguments:
      column: description

# PII detection check with custom threshold and named entities
- criticality: error
  check:
```
@@ -3038,7 +3091,7 @@ The PII detection extras include a built-in `does_not_contain_pii` check that us

```python
from databricks.labs.dqx.rule import DQRowRule
from databricks.labs.dqx.pii.pii_detection_funcs import does_not_contain_pii

checks = [
    # Basic PII detection check
    DQRowRule(
```
@@ -3056,7 +3109,7 @@ The PII detection extras include a built-in `does_not_contain_pii` check that us

```python
        check_func_kwargs={"threshold": 0.8, "entities": ["PERSON", "EMAIL_ADDRESS"]}
    ),
]
```
</TabItem>
</Tabs>
@@ -3093,7 +3146,7 @@ These can be loaded using `NLPEngineConfig`:

```python
from databricks.labs.dqx.rule import DQRowRule
from databricks.labs.dqx.pii.pii_detection_funcs import does_not_contain_pii
from databricks.labs.dqx.pii.nlp_engine_config import NLPEngineConfig

checks = [
    # PII detection check using spacy as a named entity recognizer
    DQRowRule(
```

@@ -3102,7 +3155,7 @@ These can be loaded using `NLPEngineConfig`:

```python
        column="description",
        check_func=does_not_contain_pii,
        check_func_kwargs={"nlp_engine_config": NLPEngineConfig.SPACY_MEDIUM}
    ),
]
```
</TabItem>
@@ -3122,7 +3175,7 @@ Using custom models for named-entity recognition may require you to install thes

```python
from databricks.labs.dqx.rule import DQRowRule
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient

nlp_engine_config = {
    'nlp_engine_name': 'transformers_stanford_deidentifier_base',
    'models': [
```

@@ -3165,9 +3218,9 @@ Using custom models for named-entity recognition may require you to install thes

```python
        column="description",
        check_func=does_not_contain_pii,
        check_func_kwargs={"nlp_engine_config": nlp_engine_config},
    ),
]

dq_engine = DQEngine(WorkspaceClient())
df = spark.read.table("main.default.table")
valid_df, quarantine_df = dq_engine.apply_checks_and_split(df, checks)
```
@@ -1747,6 +1747,69 @@ def apply(df: DataFrame) -> DataFrame:

```python
    return condition, apply


@register_rule("row")
def is_valid_json(column: str | Column) -> Column:
    """
    Checks whether the values in the input column are valid JSON strings.

    Args:
        column: Column name (str) or Column expression to check for valid JSON.

    Returns:
        A Spark Column representing the condition for invalid JSON strings.
    """
    col_str_norm, col_expr_str, col_expr = _get_normalized_column_and_expr(column)
    return make_condition(
        ~F.when(F.col(col_expr_str).isNotNull(), F.try_parse_json(col_expr_str).isNotNull()),
        F.concat_ws(
            "", F.lit("Value '"), col_expr.cast("string"), F.lit(f"' in Column '{col_expr_str}' is not a valid JSON")
        ),
        f"{col_str_norm}_is_not_valid_json",
    )
```
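The `~F.when(...)` condition is easy to misread, so it helps to trace the three cases: a parseable value gives `~true = false` (pass), an unparseable value gives `~false = true` (fail), and a null value matches no `when` branch, so the condition stays null and the row passes. A quick sketch of that behavior, assuming an ambient Spark session on a runtime that supports `try_parse_json`:

```python
from pyspark.sql import functions as F

df = spark.createDataFrame([('{"a": 1}',), ("not json",), (None,)], ["col_json_str"])

# Same condition shape as in is_valid_json above: true means the row fails the check.
fails = ~F.when(
    F.col("col_json_str").isNotNull(),
    F.try_parse_json("col_json_str").isNotNull(),
)
df.select("col_json_str", fails.alias("fails_check")).show()
# Expected: false for '{"a": 1}', true for 'not json', null (treated as pass) for NULL.
```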
```python
@register_rule("row")
def has_json_keys(column: str | Column, keys: list[str], require_all: bool = True) -> Column:
    """
    Checks whether the values in the input column contain specific JSON keys.

    Args:
        column (str | Column): The name of the column or the column itself to check for JSON keys.
```

Suggested change (docstring):

```diff
-        column (str | Column): The name of the column or the column itself to check for JSON keys.
+        column (str | Column): The name of the column or the column expression to check for JSON keys.
```
**Copilot AI (Oct 16, 2025), outdated:** Using `set(keys)` loses ordering and can lead to non-deterministic error messages (and test flakiness). Prefer a stable order: `unique_keys_lit = F.lit(sorted(set(keys)))`.

```diff
-    unique_keys_lit = F.lit(list(set(keys)))
+    unique_keys_lit = F.lit(sorted(set(keys)))
```
**Copilot AI (Oct 16, 2025):** `pyspark.sql.functions` does not expose `json_object_keys`; this will raise `AttributeError` at runtime. Use `F.expr` to call the SQL function or derive keys via `from_json` + `map_keys`. For example: `json_keys_array = F.expr(f"json_object_keys({col_expr_str})")`.

```diff
-    json_keys_array = F.json_object_keys(col_expr)
+    json_keys_array = F.expr(f"json_object_keys({col_expr_str})")
```
**Review comment:** Interestingly, `json_object_keys` will only return keys of the outer object. This is probably fine, but we should document it explicitly.
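To make the point concrete, a small illustration of the outer-object behavior (a sketch, assuming a live Spark session; nested keys such as `b` are not reported):

```python
spark.sql("""SELECT json_object_keys('{"a": {"b": 1}, "c": 2}') AS keys""").show(truncate=False)
# +------+
# |keys  |
# +------+
# |[a, c]|
# +------+
```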
**Copilot AI (Oct 16, 2025), outdated:** `has_json_keys` returns a null condition (and thus passes) for invalid JSON or null values, which means it won't flag missing keys unless a separate `is_valid_json` rule is also configured. To make `has_json_keys` self-contained, include a JSON-validity guard in the condition (so invalid JSON fails this check as well). For example: compute `json_valid = F.try_parse_json(col_expr).isNotNull()` and combine it with the key presence logic.

```diff
-    if require_all:
-        condition = F.size(F.array_except(unique_keys_lit, json_keys_array)) == 0
-    else:
-        condition = F.when(is_valid_json(col_str_norm).isNull(), F.arrays_overlap(json_keys_array, unique_keys_lit))
+    json_valid = F.try_parse_json(col_expr).isNotNull()
+    if require_all:
+        condition = json_valid & (F.size(F.array_except(unique_keys_lit, json_keys_array)) == 0)
+    else:
+        condition = json_valid & F.arrays_overlap(json_keys_array, unique_keys_lit)
```
**Copilot AI (Oct 16, 2025), outdated:** The alias for `has_json_keys` does not include the target keys. If this rule is applied multiple times to the same column with different key sets, the resulting alias collides (e.g., duplicate `'col_json_str_has_no_json_keys'`). Include the keys (and optionally `require_all`) in the alias to ensure uniqueness, e.g., `alias_name = f"{col_str_norm}_has_no_json_{'_'.join(sorted(set(keys)))}"`.

```diff
-        f"{col_str_norm}_has_no_json_keys",
+        f"{col_str_norm}_has_no_json_{'_'.join(sorted(set(keys)))}{'_all' if require_all else '_any'}",
```
@@ -189,6 +189,28 @@

```yaml
    arguments:
      column: col5
      date_format: yyyy-MM-dd
- criticality: error
  check:
    function: is_valid_json
    arguments:
      column: col_json_str
- criticality: error
  check:
    function: has_json_keys
    arguments:
      column: col_json_str
      keys:
        - key1
- criticality: error
  name: col_json_str_has_no_json_keys2
```
Suggested change:

```diff
-  name: col_json_str_has_no_json_keys2
+  name: col_json_str_has_no_json_key1_key2
```
**Review comment:** Documentation gaps: (1) `is_valid_json` validates any JSON (object/array/etc.), not only 'JSON objects'—please reword. (2) `has_json_keys` supports a `require_all` flag (default `True`) but it's not documented—add it to the parameter list.
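Addressing both points, the two table rows could be reworded along these lines (a sketch of the fix, not the merged text):

```markdown
| `is_valid_json` | Checks whether the values in the input column are valid JSON (objects, arrays, or scalars). | `column`: column to check (can be a string column name or a column expression) |
| `has_json_keys` | Checks whether the values in the input column contain specific top-level JSON keys. | `column`: column to check (can be a string column name or a column expression); `keys`: list of JSON keys to check for; `require_all`: whether all keys must be present (default: True) |
```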