diff --git a/.github/actions/upload-validation-result-schemas-findings/action.yml b/.github/actions/upload-validation-result-schemas-findings/action.yml new file mode 100644 index 000000000000..733342bb77e3 --- /dev/null +++ b/.github/actions/upload-validation-result-schemas-findings/action.yml @@ -0,0 +1,11 @@ +name: Upload VRS Findings +description: Upload validation result schema findings artifact +runs: + using: composite + steps: + - uses: actions/upload-artifact@v4 + if: always() + with: + name: validation-result-schemas-findings + path: tests/_artifacts/validation_result_schemas/findings/ + if-no-files-found: ignore diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ae16542343e..15ce29c2cec5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -370,6 +370,10 @@ jobs: # TODO: revert the timeout back to 1.5 or lower after resolving arc issues run: invoke ci-tests -m "unit" --xdist --slowest=10 --timeout=2.0 --reports + - name: Upload validation result schema findings + if: always() + uses: ./.github/actions/upload-validation-result-schemas-findings + # upload coverage report to codecov - name: Upload coverage reports to Codecov continue-on-error: true @@ -843,6 +847,10 @@ jobs: esac invoke ci-tests '${{ matrix.markers }}' --up-services --verbose --reports $FLAGS + - name: Upload validation result schema findings + if: always() + uses: ./.github/actions/upload-validation-result-schemas-findings + # upload coverage report to codecov - name: Upload coverage reports to Codecov continue-on-error: true @@ -943,6 +951,10 @@ jobs: env: SHARD: ${{ matrix.shard }} + - name: Upload validation result schema findings + if: always() + uses: ./.github/actions/upload-validation-result-schemas-findings + # upload coverage report to codecov - name: Upload coverage reports to Codecov continue-on-error: true diff --git a/.gitignore b/.gitignore index 764f679fc439..2827ace2a805 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,6 @@ 
assets/docker/mercury/volume/ # mise mise.toml + +# Test artifact output (validation result schemas findings, etc.) +/tests/_artifacts/ diff --git a/great_expectations/core/expectation_validation_result.py b/great_expectations/core/expectation_validation_result.py index 3da2a433542c..a37dc8ae0ab8 100644 --- a/great_expectations/core/expectation_validation_result.py +++ b/great_expectations/core/expectation_validation_result.py @@ -386,6 +386,48 @@ def describe(self) -> str: """JSON string description of this ExpectationValidationResult""" return json.dumps(self.describe_dict(), indent=4) + def as_typed(self, *, engine_hint: Optional[str] = None): + """Return a typed view of self.result without mutating anything. + + Lazy-imports the dispatcher to avoid an import cycle at module load. + Reads expectation_type from self.expectation_config.type and ResultFormat + from self.expectation_config.kwargs.get('result_format', DEFAULT_RESULT_FORMAT). + Returns the parsed model. Raises ParseError on validation failure. + + engine_hint: optional 'pandas' | 'spark' | 'sql'. When supplied, the + dispatcher uses it directly. When None, the dispatcher sniffs from the + result dict shape. 
+ """ + from great_expectations.core.result_format import ( + DEFAULT_RESULT_FORMAT, + ResultFormat, + ) + from great_expectations.core.validation_result_schemas.dispatcher import ( + as_typed, + ) + + result_format_value = ( + self.expectation_config.kwargs.get("result_format", DEFAULT_RESULT_FORMAT) + if self.expectation_config + else DEFAULT_RESULT_FORMAT + ) + # ResultFormat may be string or enum; normalize + if isinstance(result_format_value, str): + result_format = ResultFormat(result_format_value) + elif isinstance(result_format_value, dict): + result_format = ResultFormat(result_format_value["result_format"]) + else: + result_format = result_format_value + + expectation_type = self.expectation_config.type if self.expectation_config else "unknown" + + return as_typed( + self.result or {}, + expectation_type=expectation_type, + result_format=result_format, + engine_hint=engine_hint, + ) + class ExpectationValidationResultSchema(Schema): success = fields.Bool(required=False, allow_none=True) diff --git a/great_expectations/core/validation_result_schemas/__init__.py b/great_expectations/core/validation_result_schemas/__init__.py new file mode 100644 index 000000000000..25da4c170e43 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/__init__.py @@ -0,0 +1,14 @@ +"""Internal-only typed validation result schemas. + +Re-exports are populated as implementation tasks land. This package is not +added to great_expectations/__init__.py and contains no @public_api symbols. 
+""" + +from great_expectations.core.validation_result_schemas.dispatcher import ( + ParseError, + Result, + as_typed, + family_for, +) + +__all__ = ["ParseError", "Result", "as_typed", "family_for"] diff --git a/great_expectations/core/validation_result_schemas/dispatcher.py b/great_expectations/core/validation_result_schemas/dispatcher.py new file mode 100644 index 000000000000..f8c28eec8463 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/dispatcher.py @@ -0,0 +1,257 @@ +"""Dispatcher for typed validation result schemas. + +Public API: + as_typed(result_dict, *, expectation_type, result_format, engine_hint=None) -> Result + family_for(expectation_type: str) -> str + Result (Union alias) + ParseError (exception) + +All four are re-exported from ``validation_result_schemas/__init__.py``. + +Import rules (enforced by ruff banned-api): +- Pydantic symbols come exclusively from ``great_expectations.compatibility.pydantic``. +- No PEP 604 unions (``X | Y``); use ``Optional[X]`` or ``Union[X, Y]``. +- No direct ``import pydantic``. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, Optional, Union + +from great_expectations.compatibility import pydantic +from great_expectations.core.result_format import ResultFormat +from great_expectations.core.validation_result_schemas.schemas.aggregate_result import ( + AggregateBasicResult, + AggregateBooleanOnlyResult, + AggregateCompleteResult, + AggregateSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.map_result import ( + MapBasicResult, + MapBooleanOnlyResult, + MapCompleteResult, + MapSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.per_expectation_overrides import ( + ExpectColumnValuesToBeOfTypeSqlSparkResult, +) + +# --------------------------------------------------------------------------- +# Public type alias +# --------------------------------------------------------------------------- + +Result = Union[ + MapBooleanOnlyResult, + MapBasicResult, + MapSummaryResult, + MapCompleteResult, + AggregateBooleanOnlyResult, + AggregateBasicResult, + AggregateSummaryResult, + AggregateCompleteResult, + ExpectColumnValuesToBeOfTypeSqlSparkResult, +] + +# --------------------------------------------------------------------------- +# ParseError +# --------------------------------------------------------------------------- + + +class ParseError(Exception): + """Raised when as_typed cannot match result_dict to a registered schema variant. + + Wraps pydantic.ValidationError; message names the unmatched fields and the + candidate variant(s) that were tried. + """ + + +# Module-level error message templates (TRY003: avoid long messages outside exception class). 
+def _override_parse_error_msg( + expectation_type: str, eff_engine: Optional[str], cls_name: str, exc: object +) -> str: + return f"Failed to parse {expectation_type!r} with engine={eff_engine!r} as {cls_name}: {exc}" + + +def _family_parse_error_msg( + expectation_type: str, fmt_value: str, cls_name: str, exc: object +) -> str: + return f"Failed to parse {expectation_type!r} ({fmt_value}) as {cls_name}: {exc}" + + +# --------------------------------------------------------------------------- +# _FAMILY_TABLE — hand-authored; covers all 60 core expectations +# --------------------------------------------------------------------------- +# +# Map expectations: those extending ColumnMapExpectation, ColumnPairMapExpectation, +# or MulticolumnMapExpectation (32 total). +# +# Aggregate expectations: everything else — ColumnAggregateExpectation, +# BatchExpectation, TableExpectation, etc. (28 total). + +_FAMILY_TABLE: Dict[str, str] = { + # ---- MAP (ColumnMapExpectation) ---------------------------------------- + "expect_column_value_lengths_to_be_between": "map", + "expect_column_value_lengths_to_equal": "map", + "expect_column_value_z_scores_to_be_less_than": "map", + "expect_column_values_to_be_between": "map", + "expect_column_values_to_be_dateutil_parseable": "map", + "expect_column_values_to_be_decreasing": "map", + "expect_column_values_to_be_in_set": "map", + "expect_column_values_to_be_in_type_list": "map", + "expect_column_values_to_be_increasing": "map", + "expect_column_values_to_be_json_parseable": "map", + "expect_column_values_to_be_null": "map", + "expect_column_values_to_be_of_type": "map", + "expect_column_values_to_be_unique": "map", + "expect_column_values_to_match_json_schema": "map", + "expect_column_values_to_match_like_pattern": "map", + "expect_column_values_to_match_like_pattern_list": "map", + "expect_column_values_to_match_regex": "map", + "expect_column_values_to_match_regex_list": "map", + "expect_column_values_to_match_strftime_format": 
"map", + "expect_column_values_to_not_be_in_set": "map", + "expect_column_values_to_not_be_null": "map", + "expect_column_values_to_not_match_like_pattern": "map", + "expect_column_values_to_not_match_like_pattern_list": "map", + "expect_column_values_to_not_match_regex": "map", + "expect_column_values_to_not_match_regex_list": "map", + # ---- MAP (ColumnPairMapExpectation) ------------------------------------ + "expect_column_pair_values_a_to_be_greater_than_b": "map", + "expect_column_pair_values_to_be_equal": "map", + "expect_column_pair_values_to_be_in_set": "map", + # ---- MAP (MulticolumnMapExpectation) ----------------------------------- + "expect_compound_columns_to_be_unique": "map", + "expect_multicolumn_sum_to_equal": "map", + "expect_multicolumn_values_to_be_unique": "map", + "expect_select_column_values_to_be_unique_within_record": "map", + # ---- AGGREGATE (ColumnAggregateExpectation) ---------------------------- + "expect_column_bootstrapped_ks_test_p_value_to_be_greater_than": "aggregate", + "expect_column_chisquare_test_p_value_to_be_greater_than": "aggregate", + "expect_column_distinct_values_to_be_in_set": "aggregate", + "expect_column_distinct_values_to_contain_set": "aggregate", + "expect_column_distinct_values_to_equal_set": "aggregate", + "expect_column_kl_divergence_to_be_less_than": "aggregate", + "expect_column_max_to_be_between": "aggregate", + "expect_column_mean_to_be_between": "aggregate", + "expect_column_median_to_be_between": "aggregate", + "expect_column_min_to_be_between": "aggregate", + "expect_column_most_common_value_to_be_in_set": "aggregate", + "expect_column_pair_cramers_phi_value_to_be_less_than": "aggregate", + "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than": "aggregate", + "expect_column_proportion_of_non_null_values_to_be_between": "aggregate", + "expect_column_proportion_of_unique_values_to_be_between": "aggregate", + "expect_column_quantile_values_to_be_between": "aggregate", + 
"expect_column_stdev_to_be_between": "aggregate", + "expect_column_sum_to_be_between": "aggregate", + "expect_column_to_exist": "aggregate", + "expect_column_unique_value_count_to_be_between": "aggregate", + # ---- AGGREGATE (TableExpectation / BatchExpectation) ------------------- + "expect_query_results_to_match_comparison": "aggregate", + "expect_table_column_count_to_be_between": "aggregate", + "expect_table_column_count_to_equal": "aggregate", + "expect_table_columns_to_match_ordered_list": "aggregate", + "expect_table_columns_to_match_set": "aggregate", + "expect_table_row_count_to_be_between": "aggregate", + "expect_table_row_count_to_equal": "aggregate", + "expect_table_row_count_to_equal_other_table": "aggregate", +} + +# --------------------------------------------------------------------------- +# _OVERRIDE_TABLE — per-expectation engine-specific class overrides +# --------------------------------------------------------------------------- + +_OVERRIDE_TABLE: Dict[str, Dict[str, Any]] = { + "expect_column_values_to_be_of_type": { + "sql": ExpectColumnValuesToBeOfTypeSqlSparkResult, + "spark": ExpectColumnValuesToBeOfTypeSqlSparkResult, + } +} + +# --------------------------------------------------------------------------- +# Format dispatch tables +# --------------------------------------------------------------------------- + +_FORMAT_MAP: Dict[str, Dict[ResultFormat, Any]] = { + "map": { + ResultFormat.BOOLEAN_ONLY: MapBooleanOnlyResult, + ResultFormat.BASIC: MapBasicResult, + ResultFormat.SUMMARY: MapSummaryResult, + ResultFormat.COMPLETE: MapCompleteResult, + }, + "aggregate": { + ResultFormat.BOOLEAN_ONLY: AggregateBooleanOnlyResult, + ResultFormat.BASIC: AggregateBasicResult, + ResultFormat.SUMMARY: AggregateSummaryResult, + ResultFormat.COMPLETE: AggregateCompleteResult, + }, +} + + +# --------------------------------------------------------------------------- +# Public functions +# 
--------------------------------------------------------------------------- + + +def family_for(expectation_type: str) -> str: + """Return ``'map'`` or ``'aggregate'`` for the given expectation type. + + Falls back to ``'aggregate'`` for unknown types so that novel / third-party + expectations degrade gracefully rather than raising a hard error. + """ + return _FAMILY_TABLE.get(expectation_type, "aggregate") + + +def as_typed( + result_dict: Dict[str, Any], + *, + expectation_type: str, + result_format: ResultFormat, + engine_hint: Optional[str] = None, +) -> Result: + """Dispatch ``result_dict`` to the matching schema variant and return the parsed model. + + Resolution order: + 1. Normalise ``engine_hint``: SQL sniffing when ``engine_hint is None`` and + ``unexpected_index_query`` is present in ``result_dict``. + 2. Per-expectation override table (e.g. SQL/Spark path for + ``expect_column_values_to_be_of_type``). + 3. Family-based dispatch via ``_FORMAT_MAP[family][result_format]``. + + Raises: + ParseError: when pydantic construction fails; message names the + candidate class and the validation error. + """ + # 1. Normalise engine_hint — SQL sniffing + eff_engine = engine_hint + if eff_engine is None and "unexpected_index_query" in result_dict: + eff_engine = "sql" + + # 2. Per-expectation override + override_engines = _OVERRIDE_TABLE.get(expectation_type, {}) + if eff_engine in override_engines: + schema_cls = override_engines[eff_engine] + try: + return schema_cls(**result_dict) + except pydantic.ValidationError as exc: + raise ParseError( + _override_parse_error_msg(expectation_type, eff_engine, schema_cls.__name__, exc) + ) from exc + + # 3. Family-based dispatch + family = family_for(expectation_type) + schema_cls = _FORMAT_MAP[family][result_format] + + # Pass engine_hint into the model only for map-family schemas. Map schemas + # declare ``engine_hint`` as a field (MapResultBase) so that root validators + # can enforce SQL-required fields. 
Aggregate schemas do not declare the field + # and use ``extra = Extra.forbid``, so injecting it would raise a ValidationError. + data = dict(result_dict) + if eff_engine is not None and family == "map": + data["engine_hint"] = eff_engine + + try: + return schema_cls(**data) + except pydantic.ValidationError as exc: + raise ParseError( + _family_parse_error_msg(expectation_type, result_format.value, schema_cls.__name__, exc) + ) from exc diff --git a/great_expectations/core/validation_result_schemas/field_validators.py b/great_expectations/core/validation_result_schemas/field_validators.py new file mode 100644 index 000000000000..15a5acc4029f --- /dev/null +++ b/great_expectations/core/validation_result_schemas/field_validators.py @@ -0,0 +1,137 @@ +"""Reusable pydantic v1 field validators for validation result schemas. + +All validators are pure functions intended to be bound to schema classes via +``pydantic.validator`` and ``pydantic.root_validator``. They are defined once +here and imported by every schema family so that per-format classes stay thin. + +Import rules (enforced by ruff banned-api): +- Pydantic symbols come exclusively from ``great_expectations.compatibility.pydantic``. +- ``RuntimeTypeName`` comes from ``validation_result_schemas.types``. +- No direct ``import pydantic``, no PEP 604 unions. +""" + +from __future__ import annotations + +from typing import Any, Optional + +from great_expectations.core.validation_result_schemas.types import RuntimeTypeName + +# --------------------------------------------------------------------------- +# Runtime-type classifier +# --------------------------------------------------------------------------- + +# Module-level type map used by classify_runtime_type. +# bool is intentionally excluded — it must be checked before int (bool is a +# subclass of int), so it gets its own explicit branch in the function. 
+_RUNTIME_TYPE_MAP: dict = { + int: RuntimeTypeName.INT, + float: RuntimeTypeName.FLOAT, + str: RuntimeTypeName.STR, + list: RuntimeTypeName.LIST, + dict: RuntimeTypeName.DICT, +} + +# Module-level constant for the SQL engine validation error message (TRY003). +_SQL_INDEX_QUERY_REQUIRED_MSG = ( + "unexpected_index_query is required when engine_hint='sql' and " + "return_unexpected_index_query=True, but it was not found in the " + "result dict. This indicates a schema mismatch for this SQL engine " + "and ResultFormat combination." +) + + +def classify_runtime_type(value: Any) -> RuntimeTypeName: + """Classify the runtime type of a heterogeneous field (e.g., unexpected_rows). + + Returns a stable ``RuntimeTypeName`` enum value used in findings metadata. + Never raises — all branches end in a known enum member. + + Detects pandas and pyspark DataFrames by matching both ``type(v).__name__`` + and the package prefix of ``type(v).__module__``, so neither library is + imported; anything else (including other pyspark objects) is ``OTHER``. + """ + if value is None: + return RuntimeTypeName.NONE + + # Check bool before int — bool is a subclass of int in Python + if isinstance(value, bool): + return RuntimeTypeName.BOOL + + for t, name in _RUNTIME_TYPE_MAP.items(): + if isinstance(value, t): + return name + + # DataFrame detection by class name and defining package; neither library is imported + type_name = type(value).__name__ + module = type(value).__module__ + if type_name == "DataFrame" and module.startswith("pyspark"): + return RuntimeTypeName.DATAFRAME_SPARK + if type_name == "DataFrame" and module.startswith("pandas"): + return RuntimeTypeName.DATAFRAME_PANDAS + + return RuntimeTypeName.OTHER + + +# --------------------------------------------------------------------------- +# Field validators (pydantic v1 style — bound by callers via validator()) +# --------------------------------------------------------------------------- + + +def validate_unexpected_rows_passthrough(cls: Any, v: Any) -> Any: + """v1 validator for ``unexpected_rows``. 
+ + Accepts any runtime type; the matrix runner records the actual type via + ``classify_runtime_type`` for findings. Does **not** raise on type mismatch + — the schema accepts ``Any`` for this field because the runtime type differs + across execution engines (pandas DataFrame, list[dict] on SQL, Spark frame). + """ + return v + + +def validate_partial_unexpected_counts_fallback(cls: Any, v: Optional[list]) -> Optional[list]: + """v1 validator for ``partial_unexpected_counts``. + + Accepts the two documented shapes: + - Canonical: ``[{"value": x, "count": n}, ...]`` + - Error fallback: ``[{"error": "partial_exception_counts requires a hashable type"}]`` + - ``None`` + + Both shapes are returned unchanged — the validator is a passthrough that + exists so the schema explicitly acknowledges the fallback rather than + inadvertently forbidding it. + """ + return v + + +# --------------------------------------------------------------------------- +# Root validator +# --------------------------------------------------------------------------- + + +def root_validate_engine_required_fields(cls: Any, values: dict) -> dict: + """v1 root_validator for SQL engine-required fields. + + If ``engine_hint`` is ``"sql"`` and ``return_unexpected_index_query`` is + ``True``, asserts that ``unexpected_index_query`` is present (non-None) in + the parsed values dict. All other combinations are a no-op. + + Engine hint is read from the ``engine_hint`` key in the ``values`` dict. + Schemas that do not declare ``engine_hint`` as a field will simply not have + the key, and the check is skipped — ensuring the validator is safe to include + in any schema regardless of whether the dispatcher sets the hint. 
+ """ + engine_hint = values.get("engine_hint") + + if engine_hint != "sql": + # Not a SQL engine (or no hint): skip the SQL-specific assertion + return values + + if not values.get("return_unexpected_index_query"): + # SQL engine, but the query was not requested: no assertion needed + return values + + # SQL engine + query was requested: unexpected_index_query must be present + if not values.get("unexpected_index_query"): + raise ValueError(_SQL_INDEX_QUERY_REQUIRED_MSG) + + return values diff --git a/great_expectations/core/validation_result_schemas/findings_emitter.py b/great_expectations/core/validation_result_schemas/findings_emitter.py new file mode 100644 index 000000000000..7ddf43d9b164 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/findings_emitter.py @@ -0,0 +1,112 @@ +"""Findings file writer for validation result schemas. + +Per-run-id findings file writer that emits a deterministic JSON envelope. + +Construction resolves the output directory: + 1. ``output_dir`` argument if provided + 2. environment variable GX_VALIDATION_FINDINGS_DIR if set + 3. else _DEFAULT_DIR (gitignored in the gx repo) + +The filename is ``f"{run_id}.json"``. Findings are accumulated in memory and +flushed on ``close()``; the file is written atomically (write to ``.tmp``, +then ``Path.replace``). Within a file, findings are sorted by +``(expectation_type, engine, result_format)`` for deterministic diffs across +runs. 
+""" + +from __future__ import annotations + +import json +import os +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING, List, Optional + +if TYPE_CHECKING: + from typing_extensions import Self + + from great_expectations.core.validation_result_schemas.types import Finding + +_DEFAULT_DIR: Path = Path("tests/_artifacts/validation_result_schemas/findings") +_ENV_VAR: str = "GX_VALIDATION_FINDINGS_DIR" +SCHEMA_VERSION: int = 1 + + +def _get_gx_version() -> str: + """Return the installed great_expectations version string.""" + try: + import great_expectations + + return str(great_expectations.__version__) + except (ImportError, AttributeError): + return "unknown" + + +class FindingsWriter: + """Per-run-id findings file writer. + + Construction resolves the output directory: + 1. ``output_dir`` argument if provided + 2. environment variable GX_VALIDATION_FINDINGS_DIR if set + 3. else _DEFAULT_DIR (gitignored in the gx repo) + + The filename is f"{run_id}.json". Findings are appended in memory and + flushed on close(); the file is written atomically (write to .tmp, rename). + Within a file, findings are sorted by (expectation_type, engine, + result_format) for deterministic diffs across runs. 
+ """ + + def __init__(self, run_id: str, output_dir: Optional[Path] = None) -> None: + self._run_id = run_id + self._findings: List[Finding] = [] + self._started_at_utc: str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Directory resolution: arg → env var → _DEFAULT_DIR + if output_dir is not None: + self._output_dir = Path(output_dir) + else: + env_val = os.environ.get(_ENV_VAR) # noqa: TID251 # os.environ allowed in config files + if env_val is not None: + self._output_dir = Path(env_val) + else: + self._output_dir = _DEFAULT_DIR + + self._output_dir.mkdir(parents=True, exist_ok=True) + + def write_finding(self, finding: Finding) -> None: + """Append *finding* to the in-memory list.""" + self._findings.append(finding) + + def close(self) -> None: + """Sort findings and write them atomically to the output file.""" + completed_at_utc: str = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + # Sort deterministically by (expectation_type, engine, result_format) + sorted_findings: List[Finding] = sorted( + self._findings, + key=lambda f: ( + f.get("expectation_type", ""), + f.get("engine", ""), + f.get("result_format", ""), + ), + ) + + envelope = { + "schema_version": SCHEMA_VERSION, + "run_id": self._run_id, + "started_at_utc": self._started_at_utc, + "completed_at_utc": completed_at_utc, + "gx_version": _get_gx_version(), + "findings": sorted_findings, + } + + filepath = self._output_dir / f"{self._run_id}.json" + tmp_path = Path(str(filepath) + ".tmp") + + tmp_path.write_text(json.dumps(envelope, indent=2)) + tmp_path.replace(filepath) + + def __enter__(self) -> Self: + return self + + def __exit__(self, *exc: object) -> None: + self.close() diff --git a/great_expectations/core/validation_result_schemas/format_config.py b/great_expectations/core/validation_result_schemas/format_config.py new file mode 100644 index 000000000000..5e4782092a24 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/format_config.py @@ 
-0,0 +1,30 @@ +"""ResultFormatConfig TypedDict for internal use by the validation result dispatcher. + +These types are not part of the public API and must not be exported via +great_expectations/__init__.py or decorated with @public_api. +""" + +from __future__ import annotations + +from typing import TypedDict + + +class ResultFormatConfigRequired(TypedDict): + """Required keys always present in a parsed result-format config dict.""" + + result_format: str # one of the 4 ResultFormat enum values + partial_unexpected_count: int + include_unexpected_rows: bool + map_expectation_unexpected_rows_as_dict: bool + + +class ResultFormatConfig(ResultFormatConfigRequired, total=False): + """Full result-format config dict including optional keys. + + The two-class overlay pattern (required base + total=False subclass) lets us + express "required + optional" without NotRequired[...], which requires + Python 3.11+. This keeps the code parseable on Python 3.10. + """ + + exclude_unexpected_values: bool + return_unexpected_index_query: bool diff --git a/great_expectations/core/validation_result_schemas/schemas/__init__.py b/great_expectations/core/validation_result_schemas/schemas/__init__.py new file mode 100644 index 000000000000..717183678941 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/schemas/__init__.py @@ -0,0 +1,29 @@ +"""Schema family re-exports. 
Populated as schema tasks land.""" + +from great_expectations.core.validation_result_schemas.schemas.aggregate_result import ( + AggregateBasicResult, + AggregateBooleanOnlyResult, + AggregateCompleteResult, + AggregateSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.map_result import ( + MapBasicResult, + MapBooleanOnlyResult, + MapCompleteResult, + MapSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.per_expectation_overrides import ( + ExpectColumnValuesToBeOfTypeSqlSparkResult, +) + +__all__ = [ + "AggregateBasicResult", + "AggregateBooleanOnlyResult", + "AggregateCompleteResult", + "AggregateSummaryResult", + "ExpectColumnValuesToBeOfTypeSqlSparkResult", + "MapBasicResult", + "MapBooleanOnlyResult", + "MapCompleteResult", + "MapSummaryResult", +] diff --git a/great_expectations/core/validation_result_schemas/schemas/aggregate_result.py b/great_expectations/core/validation_result_schemas/schemas/aggregate_result.py new file mode 100644 index 000000000000..0ba9f0ddb708 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/schemas/aggregate_result.py @@ -0,0 +1,91 @@ +"""Aggregate-style validation result schema family. + +Covers AggregateExpectation types (column-level aggregate expectations such as +expect_column_mean_to_be_between, expect_column_min_to_be_between, etc.). + +Four format-discriminated classes share a common base: + + AggregateResultBase + ├── AggregateBooleanOnlyResult (BOOLEAN_ONLY) + └── AggregateBasicResult (BASIC) + └── AggregateSummaryResult (SUMMARY) + └── AggregateCompleteResult (COMPLETE) + +Import rules (enforced by ruff banned-api): +- Pydantic symbols come exclusively from ``great_expectations.compatibility.pydantic``. +- No PEP 604 unions (``X | Y``); use ``Optional[X]`` or ``Union[X, Y]``. +- No direct ``import pydantic``. 
+""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union + +from great_expectations.compatibility import pydantic +from great_expectations.compatibility.pydantic import BaseModel + +# observed_value is broadly typed. pydantic v1 tries Union members left to right and +# coerces as it goes, so plain bool/float/int/str members would turn 1 into True, 5 into +# 5.0 and "yes" into True. The Strict* members accept only an exact runtime-type match, +# which keeps every observed value exactly as the expectation emitted it. +ObservedValue = Union[ + pydantic.StrictBool, + pydantic.StrictInt, + pydantic.StrictFloat, + pydantic.StrictStr, + List[Any], + Dict[str, Any], + None, +] + + +class AggregateResultBase(BaseModel): + """Base for all aggregate-style result models. + + Fields here are the always-allowed superset shared by every format variant. + ``extra = Extra.forbid`` is intentional: the matrix runner *wants* unexpected + fields to fail validation so they surface in findings as cleanup queue entries. + """ + + class Config: + extra = pydantic.Extra.forbid + arbitrary_types_allowed = True + + observed_value: ObservedValue = None + details: Optional[Dict[str, Any]] = None + + +class AggregateBooleanOnlyResult(AggregateResultBase): + """ResultFormat.BOOLEAN_ONLY — typically empty result dict for aggregate expectations. + + The parent EVR carries ``success``. The result dict for BOOLEAN_ONLY + aggregate expectations typically has no additional fields. + """ + + pass # BOOLEAN_ONLY: typically empty + + +class AggregateBasicResult(AggregateResultBase): + """ResultFormat.BASIC — counts, percents, and partial lists. + + Note: ``unexpected_count`` is included here because a subset of aggregate + expectations (e.g. ``expect_column_distinct_values_to_equal_set``) emit + it alongside the standard aggregate fields. It is Optional so that the + majority of aggregate expectations — which do *not* emit it — continue to + validate cleanly. 
+ """ + + element_count: Optional[int] = None + missing_count: Optional[int] = None + missing_percent: Optional[float] = None + unexpected_count: Optional[int] = None + partial_unexpected_list: Optional[List[Any]] = None + partial_missing_list: Optional[List[Any]] = None + + +class AggregateSummaryResult(AggregateBasicResult): + """ResultFormat.SUMMARY — aggregate expectations rarely diverge from BASIC. + + Kept explicit so the dispatcher can name it distinctly. + """ + + pass # Aggregate expectations rarely diverge between BASIC and SUMMARY + + +class AggregateCompleteResult(AggregateSummaryResult): + """ResultFormat.COMPLETE — adds the full unexpected list and index list.""" + + unexpected_list: Optional[List[Any]] = None + unexpected_index_list: Optional[List[Any]] = None diff --git a/great_expectations/core/validation_result_schemas/schemas/map_result.py b/great_expectations/core/validation_result_schemas/schemas/map_result.py new file mode 100644 index 000000000000..b4ac3f42f203 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/schemas/map_result.py @@ -0,0 +1,118 @@ +"""Map-style validation result schema family. + +Covers ColumnMapExpectation (26), ColumnPairMapExpectation (3), and +MulticolumnMapExpectation (3) = 32 map-style core expectations. + +Four format-discriminated classes share a common base: + + MapResultBase + ├── MapBooleanOnlyResult (BOOLEAN_ONLY) + └── MapBasicResult (BASIC) + └── MapSummaryResult (SUMMARY) + └── MapCompleteResult (COMPLETE) + +Import rules (enforced by ruff banned-api): +- Pydantic symbols come exclusively from ``great_expectations.compatibility.pydantic``. +- No PEP 604 unions (``X | Y``); use ``Optional[X]`` or ``Union[X, Y]``. +- No direct ``import pydantic``. 
+""" + +from __future__ import annotations + +from typing import Any, List, Optional + +from great_expectations.compatibility import pydantic +from great_expectations.compatibility.pydantic import BaseModel +from great_expectations.core.validation_result_schemas.field_validators import ( + root_validate_engine_required_fields, + validate_partial_unexpected_counts_fallback, + validate_unexpected_rows_passthrough, +) + + +class MapResultBase(BaseModel): + """Base for all map-style result models. + + Fields here are the always-allowed superset shared by every format variant. + ``extra = Extra.forbid`` is intentional: the matrix runner *wants* unexpected + fields to fail validation so they surface in findings as cleanup queue entries. + """ + + class Config: + extra = pydantic.Extra.forbid + arbitrary_types_allowed = True + + # Internal engine hint — declared as a normal field so it appears in the + # values dict during root validation. ``exclude=True`` is not used here + # because pydantic v1's per-field exclude is Config-based; callers that want + # to omit this field from .dict() output should call .dict(exclude={"engine_hint"}). + engine_hint: Optional[str] = None + + # SQL-only, optional everywhere; root validator enforces presence when applicable + unexpected_index_query: Optional[str] = None + unexpected_index_column_names: Optional[List[str]] = None + + +class MapBooleanOnlyResult(MapResultBase): + """ResultFormat.BOOLEAN_ONLY — empty result dict for map expectations. + + The parent EVR carries ``success``. The result dict may carry only the + SQL index-query overflow fields when ``return_unexpected_index_query=True``. + """ + + pass # No additional fields beyond the SQL index-query fields in base + + +class MapBasicResult(MapResultBase): + """ResultFormat.BASIC — counts, percents, and the partial unexpected list. + + Note: ``observed_value`` is included here because a small set of map + expectations (e.g. 
``expect_column_values_to_be_of_type``, + ``expect_column_values_to_be_in_type_list``) emit it alongside the + standard map fields on the pandas engine path. It is Optional so that + the majority of map expectations — which do *not* emit it — continue to + validate cleanly. + """ + + element_count: Optional[int] = None + unexpected_count: Optional[int] = None + unexpected_percent: Optional[float] = None + missing_count: Optional[int] = None + missing_percent: Optional[float] = None + unexpected_percent_total: Optional[float] = None + unexpected_percent_nonmissing: Optional[float] = None + partial_unexpected_list: Optional[List[Any]] = None + # Some map expectations (e.g. expect_column_values_to_be_of_type on pandas) + # emit observed_value alongside the standard map fields. + observed_value: Optional[Any] = None + # engine-typed; classified at runtime, not validated by type + unexpected_rows: Any = None + + _validate_rows = pydantic.validator("unexpected_rows", pre=True, allow_reuse=True)( + validate_unexpected_rows_passthrough + ) + + +class MapSummaryResult(MapBasicResult): + """ResultFormat.SUMMARY — adds counts and index list for partial unexpected.""" + + partial_unexpected_counts: Optional[List[Any]] = None + partial_unexpected_index_list: Optional[List[Any]] = None + + _validate_counts = pydantic.validator("partial_unexpected_counts", pre=True, allow_reuse=True)( + validate_partial_unexpected_counts_fallback + ) + + +class MapCompleteResult(MapSummaryResult): + """ResultFormat.COMPLETE — adds the full unexpected list and index list. + + Also carries the root validator that enforces SQL engine-required fields: + when ``engine_hint='sql'`` and ``return_unexpected_index_query=True``, + ``unexpected_index_query`` must be present. 
+ """ + + unexpected_list: Optional[List[Any]] = None + unexpected_index_list: Optional[List[Any]] = None + + _root_validate = pydantic.root_validator(allow_reuse=True)(root_validate_engine_required_fields) diff --git a/great_expectations/core/validation_result_schemas/schemas/per_expectation_overrides.py b/great_expectations/core/validation_result_schemas/schemas/per_expectation_overrides.py new file mode 100644 index 000000000000..7e9246666440 --- /dev/null +++ b/great_expectations/core/validation_result_schemas/schemas/per_expectation_overrides.py @@ -0,0 +1,33 @@ +"""Per-expectation schema overrides. + +Some expectations emit result dicts that do not fit the generic map or +aggregate families. Each override here is a standalone Pydantic model with +``extra = Extra.forbid`` so unexpected fields surface as validation errors. + +Import rules (enforced by ruff banned-api): +- Pydantic symbols come exclusively from ``great_expectations.compatibility.pydantic``. +- No PEP 604 unions (``X | Y``); use ``Optional[X]`` or ``Union[X, Y]``. +- No direct ``import pydantic``. +""" + +from __future__ import annotations + +from typing import Optional + +from great_expectations.compatibility import pydantic +from great_expectations.compatibility.pydantic import BaseModel + + +class ExpectColumnValuesToBeOfTypeSqlSparkResult(BaseModel): + """ExpectColumnValuesToBeOfType bypasses _format_map_output on SQL/Spark. + + For BASIC / SUMMARY / COMPLETE formats, the result dict contains only + ``{observed_value: }``. For BOOLEAN_ONLY format the result + dict is empty ``{}``, so ``observed_value`` must be Optional here to allow + both cases through the same override schema. 
+ """ + + class Config: + extra = pydantic.Extra.forbid + + observed_value: Optional[str] = None diff --git a/great_expectations/core/validation_result_schemas/types.py b/great_expectations/core/validation_result_schemas/types.py new file mode 100644 index 000000000000..2f4c401ca08a --- /dev/null +++ b/great_expectations/core/validation_result_schemas/types.py @@ -0,0 +1,50 @@ +"""Type definitions for validation result schemas. + +Defines the enumeration types and TypedDicts used across the +validation_result_schemas package. +""" + +from __future__ import annotations + +from enum import Enum +from typing import Dict, List, Optional, TypedDict + + +class Status(str, Enum): + PARSED = "parsed" + FAILED = "failed" + + +class RuntimeTypeName(str, Enum): + NONE = "none" + INT = "int" + FLOAT = "float" + STR = "str" + BOOL = "bool" + LIST = "list" + DICT = "dict" + DATAFRAME_PANDAS = "DataFrame" + DATAFRAME_SPARK = "SparkDataFrame" + OTHER = "other" + + +class CellCoordinates(TypedDict): + expectation_type: str + result_format: str # ResultFormat enum value + engine: str # 'pandas' | 'spark' | 'sql' + datasource_test_id: str + + +class Finding(TypedDict, total=False): + expectation_type: str + result_format: str + engine: str + datasource_test_id: str + status: str # Status enum value + raw_field_set: List[str] + raw_field_types: Dict[str, str] # field name -> RuntimeTypeName value + matched_variant: Optional[str] + schema_required_fields_present: List[str] + schema_optional_fields_present: List[str] + schema_extras_rejected: List[str] + error_summary: Optional[str] diff --git a/pyproject.toml b/pyproject.toml index 95b42e486479..9651f0e5998f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -171,6 +171,14 @@ exclude = [ 'tests/validator/test_metric_configuration\.py', 'tests/validator/test_metrics_calculator\.py', 'tests/validator/test_validation_graph\.py', + # validation_result_schemas: pydantic v1 **kwargs unpacking and intentional call-arg tests + 
'tests/unit/core/validation_result_schemas/test_format_config\.py', # 13 + 'tests/unit/core/validation_result_schemas/test_schemas_map\.py', # 57 + 'tests/unit/core/validation_result_schemas/test_schemas_aggregate\.py', # 71 + 'tests/unit/core/validation_result_schemas/test_schemas_overrides\.py', # 6 + 'tests/unit/core/validation_result_schemas/test_cases_table\.py', # 1 + 'tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_cases\.py', # 2 + 'tests/integration/data_sources_and_expectations/expectations/test_validation_result_schemas_matrix\.py', # 11 ] [[tool.mypy.overrides]] @@ -703,6 +711,7 @@ markers = [ "spark: mark a test as Spark-dependent.", "spark_connect: mark a test as Spark Connect-dependent.", "trino: mark a test as trino-dependent.", + "no_xdist: mark a test module that must not be split across xdist workers (session-scoped fixtures with shared state).", "unit: mark a test as a unit test.", "v2_api: mark test as specific to the v2 api (e.g. 
pre Data Connectors).", ] diff --git a/tests/conftest.py b/tests/conftest.py index c88af003fcef..34e05775a9ef 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -287,6 +287,12 @@ def pytest_addoption(parser): action="store_true", help="If set, run performance tests (which might also require additional arguments like --bigquery)", # noqa: E501 # FIXME CoP ) + parser.addoption( + "--vrs-run-id", + action="store", + default=None, + help="Run ID for validation result schema matrix findings file (optional; auto-generated if not set).", # noqa: E501 + ) def build_test_backends_list_v2_api(metafunc): diff --git a/tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_cases.py b/tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_cases.py new file mode 100644 index 000000000000..d6468a5517fc --- /dev/null +++ b/tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_cases.py @@ -0,0 +1,506 @@ +"""EXPECTATION_CASES — one entry per core expectation. + +Underscore-prefixed so pytest does not collect this file. + +Three expectations (ExpectColumnBootstrappedKsTestPValueToBeGreaterThan, +ExpectColumnChiSquareTestPValueToBeGreaterThan, and +ExpectColumnParameterizedDistributionKsTestPValueToBeGreaterThan) are marked +``NotImplementedError`` stubs in the codebase (their ``__init__`` raises and +they are not part of the public ``gxe`` API). We represent them with a +lightweight ``_AbstractStub`` object that carries the correct +``expectation_type`` string so that the ``family_for`` lookup test still passes. +""" + +from __future__ import annotations + +from typing import List, Mapping, NamedTuple, Optional + +import pandas as pd + +import great_expectations.expectations as gxe + +# --------------------------------------------------------------------------- +# Default fixture data — small DataFrame covering the most common columns. 
+# Several rows intentionally violate common constraints (None, type mismatch) +# so result dicts are non-trivial for map expectations. +# --------------------------------------------------------------------------- + +_DEFAULT_DATA = pd.DataFrame( + { + "col_a": [1, 2, 3, None, 5], + "col_b": ["x", "y", "z", "w", None], + "col_c": [1.0, 2.0, None, 4.0, 5.0], + } +) + +# Multi-column / pair data — non-null values in every cell used for pair/multi +# expectations so at least one row satisfies A > B and A == B variants. +_PAIR_DATA = pd.DataFrame( + { + "col_a": [3, 5, 7, 10, 2], + "col_b": [1, 2, 3, 4, 1], + } +) + +# Numeric-only data for z-score and stdev expectations. +_NUMERIC_DATA = pd.DataFrame( + { + "col_a": [10, 20, 30, 40, 50], + "col_b": [1, 2, 3, 4, 5], + "col_c": [1.5, 2.5, 3.5, 4.5, 5.5], + } +) + +# Date-formatted strings for strftime / dateutil expectations. +_DATE_DATA = pd.DataFrame( + { + "col_a": ["2024-01-01", "2024-06-15", "not-a-date", "2023-12-31", "2025-03-01"], + "col_b": [1, 2, 3, 4, 5], + "col_c": [1.0, 2.0, 3.0, 4.0, 5.0], + } +) + +# JSON-formatted strings for JSON expectations. +_JSON_DATA = pd.DataFrame( + { + "col_a": ['{"a": 1}', '{"b": 2}', "not-json", '{"c": 3}', '{"d": 4}'], + "col_b": [1, 2, 3, 4, 5], + "col_c": [1.0, 2.0, 3.0, 4.0, 5.0], + } +) + + +# --------------------------------------------------------------------------- +# Stub for abstract/NotImplementedError expectations +# --------------------------------------------------------------------------- + + +class _AbstractStub: + """Minimal stand-in for the three incomplete core expectations. + + These classes raise ``NotImplementedError`` on ``__init__`` and therefore + cannot be instantiated. We store just the ``expectation_type`` string so + the test assertions that touch ``case.expectation.expectation_type`` work + correctly. 
+ """ + + def __init__(self, expectation_type: str) -> None: + self.expectation_type = expectation_type + + +# --------------------------------------------------------------------------- +# ExpectationCase definition +# --------------------------------------------------------------------------- + + +class ExpectationCase(NamedTuple): + """A single test case for a core expectation. + + Attributes: + id: Unique snake_case identifier matching the file name (e.g. + ``"expect_column_values_to_not_be_null"``). + expectation: An instantiated Expectation (or _AbstractStub for the + three not-yet-migrated expectations). + data: A small pandas DataFrame that serves as the fixture for this + case. Column names must align with whatever column/column_list + arguments are given to the expectation. + extra_data: Optional mapping of named extra DataFrames (e.g. for + expectations that reference a second table). + """ + + id: str + expectation: object # type: ignore[assignment] # Expectation or _AbstractStub + data: pd.DataFrame + extra_data: Optional[Mapping[str, pd.DataFrame]] = None + + +# --------------------------------------------------------------------------- +# EXPECTATION_CASES — one entry per expect_*.py file under core/ +# --------------------------------------------------------------------------- + +EXPECTATION_CASES: List[ExpectationCase] = [ + # ------------------------------------------------------------------ + # MAP — ColumnMapExpectation + # ------------------------------------------------------------------ + ExpectationCase( + id="expect_column_value_lengths_to_be_between", + expectation=gxe.ExpectColumnValueLengthsToBeBetween( + column="col_b", min_value=1, max_value=5 + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_value_lengths_to_equal", + expectation=gxe.ExpectColumnValueLengthsToEqual(column="col_b", value=1), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_value_z_scores_to_be_less_than", + 
expectation=gxe.ExpectColumnValueZScoresToBeLessThan( + column="col_a", threshold=3.0, double_sided=True + ), + data=_NUMERIC_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_between", + expectation=gxe.ExpectColumnValuesToBeBetween(column="col_a", min_value=0, max_value=10), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_dateutil_parseable", + expectation=gxe.ExpectColumnValuesToBeDateutilParseable(column="col_a"), + data=_DATE_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_decreasing", + expectation=gxe.ExpectColumnValuesToBeDecreasing(column="col_a"), + data=_NUMERIC_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_in_set", + expectation=gxe.ExpectColumnValuesToBeInSet(column="col_a", value_set=[1, 2, 3, None, 5]), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_in_type_list", + expectation=gxe.ExpectColumnValuesToBeInTypeList( + column="col_a", type_list=["int", "float", "NoneType"] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_increasing", + expectation=gxe.ExpectColumnValuesToBeIncreasing(column="col_a"), + data=_NUMERIC_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_json_parseable", + expectation=gxe.ExpectColumnValuesToBeJsonParseable(column="col_a"), + data=_JSON_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_null", + expectation=gxe.ExpectColumnValuesToBeNull(column="col_a"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_of_type", + expectation=gxe.ExpectColumnValuesToBeOfType(column="col_a", type_="int"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_be_unique", + expectation=gxe.ExpectColumnValuesToBeUnique(column="col_a"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_match_json_schema", + expectation=gxe.ExpectColumnValuesToMatchJsonSchema( + column="col_a", json_schema={"type": 
"object"} + ), + data=_JSON_DATA, + ), + ExpectationCase( + id="expect_column_values_to_match_like_pattern", + expectation=gxe.ExpectColumnValuesToMatchLikePattern(column="col_b", like_pattern="%"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_match_like_pattern_list", + expectation=gxe.ExpectColumnValuesToMatchLikePatternList( + column="col_b", like_pattern_list=["%x%", "%y%"] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_match_regex", + expectation=gxe.ExpectColumnValuesToMatchRegex(column="col_b", regex="^[a-z]$"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_match_regex_list", + expectation=gxe.ExpectColumnValuesToMatchRegexList( + column="col_b", regex_list=["^[a-z]$", "^[A-Z]$"] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_match_strftime_format", + expectation=gxe.ExpectColumnValuesToMatchStrftimeFormat( + column="col_a", strftime_format="%Y-%m-%d" + ), + data=_DATE_DATA, + ), + ExpectationCase( + id="expect_column_values_to_not_be_in_set", + expectation=gxe.ExpectColumnValuesToNotBeInSet(column="col_a", value_set=[99, 100]), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_not_be_null", + expectation=gxe.ExpectColumnValuesToNotBeNull(column="col_a"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_not_match_like_pattern", + expectation=gxe.ExpectColumnValuesToNotMatchLikePattern( + column="col_b", like_pattern="%z%z%" + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_not_match_like_pattern_list", + expectation=gxe.ExpectColumnValuesToNotMatchLikePatternList( + column="col_b", like_pattern_list=["%99%", "%100%"] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_values_to_not_match_regex", + expectation=gxe.ExpectColumnValuesToNotMatchRegex(column="col_b", regex="^[0-9]+$"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + 
id="expect_column_values_to_not_match_regex_list", + expectation=gxe.ExpectColumnValuesToNotMatchRegexList( + column="col_b", regex_list=["^[0-9]+$", "^[A-Z]+$"] + ), + data=_DEFAULT_DATA, + ), + # ------------------------------------------------------------------ + # MAP — ColumnPairMapExpectation + # ------------------------------------------------------------------ + ExpectationCase( + id="expect_column_pair_values_a_to_be_greater_than_b", + expectation=gxe.ExpectColumnPairValuesAToBeGreaterThanB(column_A="col_a", column_B="col_b"), + data=_PAIR_DATA, + ), + ExpectationCase( + id="expect_column_pair_values_to_be_equal", + expectation=gxe.ExpectColumnPairValuesToBeEqual(column_A="col_a", column_B="col_b"), + data=_PAIR_DATA, + ), + ExpectationCase( + id="expect_column_pair_values_to_be_in_set", + expectation=gxe.ExpectColumnPairValuesToBeInSet( + column_A="col_a", + column_B="col_b", + value_pairs_set=[(3, 1), (5, 2), (7, 3), (10, 4), (2, 1)], + ), + data=_PAIR_DATA, + ), + # ------------------------------------------------------------------ + # MAP — MulticolumnMapExpectation + # ------------------------------------------------------------------ + ExpectationCase( + id="expect_compound_columns_to_be_unique", + expectation=gxe.ExpectCompoundColumnsToBeUnique(column_list=["col_a", "col_b"]), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_multicolumn_sum_to_equal", + expectation=gxe.ExpectMulticolumnSumToEqual(column_list=["col_a", "col_b"], sum_total=3), + data=pd.DataFrame( + { + "col_a": [1, 2, 3, None, 2], + "col_b": [2, 1, 0, None, 1], + } + ), + ), + ExpectationCase( + id="expect_multicolumn_values_to_be_unique", + # This expectation lacks a map_metric so is_abstract() returns True and + # expectation_type is '' — instantiation succeeds but the expectation_type + # string would be empty. Use _AbstractStub to carry the correct type string. 
+ expectation=_AbstractStub("expect_multicolumn_values_to_be_unique"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_select_column_values_to_be_unique_within_record", + expectation=gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord( + column_list=["col_a", "col_b"] + ), + data=_DEFAULT_DATA, + ), + # ------------------------------------------------------------------ + # AGGREGATE — ColumnAggregateExpectation + # ------------------------------------------------------------------ + ExpectationCase( + id="expect_column_bootstrapped_ks_test_p_value_to_be_greater_than", + # This expectation is not yet migrated; __init__ raises NotImplementedError. + expectation=_AbstractStub("expect_column_bootstrapped_ks_test_p_value_to_be_greater_than"), + data=_NUMERIC_DATA, + ), + ExpectationCase( + id="expect_column_chisquare_test_p_value_to_be_greater_than", + # Not yet migrated; __init__ raises NotImplementedError. + expectation=_AbstractStub("expect_column_chisquare_test_p_value_to_be_greater_than"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_distinct_values_to_be_in_set", + expectation=gxe.ExpectColumnDistinctValuesToBeInSet( + column="col_a", value_set=[1, 2, 3, None, 5] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_distinct_values_to_contain_set", + expectation=gxe.ExpectColumnDistinctValuesToContainSet(column="col_a", value_set=[1, 2]), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_distinct_values_to_equal_set", + expectation=gxe.ExpectColumnDistinctValuesToEqualSet( + column="col_a", value_set=[1, 2, 3, None, 5] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_kl_divergence_to_be_less_than", + expectation=gxe.ExpectColumnKLDivergenceToBeLessThan( + column="col_a", + partition_object={ + "weights": [0.2, 0.2, 0.2, 0.2, 0.2], + "values": [1, 2, 3, None, 5], + }, + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_max_to_be_between", + 
expectation=gxe.ExpectColumnMaxToBeBetween(column="col_a", min_value=0, max_value=10), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_mean_to_be_between", + expectation=gxe.ExpectColumnMeanToBeBetween(column="col_a", min_value=0, max_value=10), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_median_to_be_between", + expectation=gxe.ExpectColumnMedianToBeBetween(column="col_a", min_value=0, max_value=10), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_min_to_be_between", + expectation=gxe.ExpectColumnMinToBeBetween(column="col_a", min_value=0, max_value=10), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_most_common_value_to_be_in_set", + expectation=gxe.ExpectColumnMostCommonValueToBeInSet(column="col_a", value_set=[1, 2, 3]), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_pair_cramers_phi_value_to_be_less_than", + # Uses column_A / column_B (not in the public gxe API as of this version; + # import directly from the core module). + expectation=_AbstractStub("expect_column_pair_cramers_phi_value_to_be_less_than"), + data=_PAIR_DATA, + ), + ExpectationCase( + id="expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than", + # Not yet migrated; __init__ raises NotImplementedError. 
+ expectation=_AbstractStub( + "expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than" + ), + data=_NUMERIC_DATA, + ), + ExpectationCase( + id="expect_column_proportion_of_non_null_values_to_be_between", + expectation=gxe.ExpectColumnProportionOfNonNullValuesToBeBetween( + column="col_a", min_value=0.0, max_value=1.0 + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_proportion_of_unique_values_to_be_between", + expectation=gxe.ExpectColumnProportionOfUniqueValuesToBeBetween( + column="col_a", min_value=0.0, max_value=1.0 + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_quantile_values_to_be_between", + expectation=gxe.ExpectColumnQuantileValuesToBeBetween( + column="col_c", + quantile_ranges={ + "quantiles": [0.25, 0.5, 0.75], + "value_ranges": [[0, 3], [1, 4], [2, 6]], + }, + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_stdev_to_be_between", + expectation=gxe.ExpectColumnStdevToBeBetween(column="col_a", min_value=0, max_value=10), + data=_NUMERIC_DATA, + ), + ExpectationCase( + id="expect_column_sum_to_be_between", + expectation=gxe.ExpectColumnSumToBeBetween(column="col_a", min_value=0, max_value=100), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_to_exist", + expectation=gxe.ExpectColumnToExist(column="col_a"), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_column_unique_value_count_to_be_between", + expectation=gxe.ExpectColumnUniqueValueCountToBeBetween( + column="col_a", min_value=1, max_value=10 + ), + data=_DEFAULT_DATA, + ), + # ------------------------------------------------------------------ + # AGGREGATE — TableExpectation / BatchExpectation + # ------------------------------------------------------------------ + ExpectationCase( + id="expect_query_results_to_match_comparison", + expectation=gxe.ExpectQueryResultsToMatchComparison( + base_query="SELECT 1 AS val", + comparison_data_source_name="other_ds", + comparison_query="SELECT 1 
AS val", + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_column_count_to_be_between", + expectation=gxe.ExpectTableColumnCountToBeBetween(min_value=1, max_value=10), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_column_count_to_equal", + expectation=gxe.ExpectTableColumnCountToEqual(value=3), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_columns_to_match_ordered_list", + expectation=gxe.ExpectTableColumnsToMatchOrderedList( + column_list=["col_a", "col_b", "col_c"] + ), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_columns_to_match_set", + expectation=gxe.ExpectTableColumnsToMatchSet(column_set=["col_a", "col_b", "col_c"]), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_row_count_to_be_between", + expectation=gxe.ExpectTableRowCountToBeBetween(min_value=1, max_value=100), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_row_count_to_equal", + expectation=gxe.ExpectTableRowCountToEqual(value=5), + data=_DEFAULT_DATA, + ), + ExpectationCase( + id="expect_table_row_count_to_equal_other_table", + expectation=gxe.ExpectTableRowCountToEqualOtherTable(other_table_name="other_table"), + data=_DEFAULT_DATA, + ), +] diff --git a/tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_helpers.py b/tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_helpers.py new file mode 100644 index 000000000000..84b1f5aab02c --- /dev/null +++ b/tests/integration/data_sources_and_expectations/expectations/_validation_result_schemas_helpers.py @@ -0,0 +1,75 @@ +"""Matrix runner helpers for validation result schema tests. + +Underscore-prefixed so pytest does not collect this file. + +These helpers are imported by the matrix runner and its unit tests. +They are intentionally free of test framework dependencies so they can +be used in both pytest fixtures and standalone scripts. 
+""" + +from __future__ import annotations + +from great_expectations.core.validation_result_schemas.field_validators import ( + classify_runtime_type, +) + +# --------------------------------------------------------------------------- +# SQL dialect normalisation table (from design.md) +# --------------------------------------------------------------------------- + +_SQL_DIALECTS = frozenset( + { + "sql", + "snowflake", + "postgres", + "redshift", + "databricks_sql", + "sqlite", + "bigquery", + "mysql", + "mssql", + } +) + + +def _normalize_engine_hint(datasource_type: str) -> str: + """Collapse SQL dialects to 'sql'; pass through 'pandas' and 'spark'. + + Unknown types are returned as-is. + """ + if datasource_type == "pandas": + return "pandas" + if datasource_type in ("spark", "dataframe"): + return "spark" + if datasource_type in _SQL_DIALECTS: + return "sql" + # Fallback: return as-is for unknown types + return datasource_type + + +def assert_field_set_covered(raw_result_dict: dict, parsed_model) -> None: + """Assert every key in raw_result_dict is reachable in parsed_model.dict(). + + The parsed model may have extra fields (like engine_hint) not in the raw + dict — that is fine. The reverse is NOT fine: raw dict keys that are + absent from the model indicate information loss. + + Raises AssertionError with the offending key(s) if any raw key is missing + from the model's dict() output. + """ + model_dict = parsed_model.dict() + missing = [k for k in raw_result_dict if k not in model_dict] + assert not missing, f"Fields in raw_result_dict not covered by parsed model: {missing}" + + +def summarize_raw_dict(raw: dict) -> dict: + """Extract structure (field names and types) from a result dict, never values. 
+ + Returns a dict with keys: + - raw_field_set: sorted list of field names + - raw_field_types: {field_name: RuntimeTypeName.value} + """ + return { + "raw_field_set": sorted(raw.keys()), + "raw_field_types": {k: classify_runtime_type(v).value for k, v in raw.items()}, + } diff --git a/tests/integration/data_sources_and_expectations/expectations/test_validation_result_schemas_matrix.py b/tests/integration/data_sources_and_expectations/expectations/test_validation_result_schemas_matrix.py new file mode 100644 index 000000000000..143aabf52592 --- /dev/null +++ b/tests/integration/data_sources_and_expectations/expectations/test_validation_result_schemas_matrix.py @@ -0,0 +1,244 @@ +"""Matrix runner for validation result schema coverage. + +Runs every (expectation x result_format x data_source) combination and writes +a structured findings JSON file. Expanded to ALL_DATA_SOURCES (task 8.1). + +Abstract stubs (the 5 cases represented by ``_AbstractStub`` in the cases table) +cannot be validated; they produce ``status=failed`` findings and the corresponding +test cells are skipped via ``pytest.skip`` — this is expected and documented here. + +Findings file location (relative to the worktree root): + tests/_artifacts/validation_result_schemas/findings/{run_id}.json + +xdist note: this module uses a session-scoped FindingsWriter; parallelising +within a single session would cause concurrent writes to the same JSON file. +The ``no_xdist`` marker documents this constraint. CI uses ``--dist loadfile`` +which naturally routes all cells from this file to a single worker, so the +constraint is satisfied without extra conftest machinery.
+""" + +from __future__ import annotations + +import datetime +import random +import string +from typing import TYPE_CHECKING + +import pandas as pd +import pytest + +from great_expectations.core.result_format import ResultFormat +from great_expectations.core.validation_result_schemas.dispatcher import as_typed +from great_expectations.core.validation_result_schemas.findings_emitter import ( + FindingsWriter, +) +from great_expectations.core.validation_result_schemas.types import Status +from tests.integration.conftest import parameterize_batch_for_data_sources +from tests.integration.data_sources_and_expectations.expectations._validation_result_schemas_cases import ( # noqa: E501 + EXPECTATION_CASES, + ExpectationCase, + _AbstractStub, +) +from tests.integration.data_sources_and_expectations.expectations._validation_result_schemas_helpers import ( # noqa: E501 + _normalize_engine_hint, + assert_field_set_covered, + summarize_raw_dict, +) +from tests.integration.data_sources_and_expectations.test_canonical_expectations import ( + ALL_DATA_SOURCES, +) + +if TYPE_CHECKING: + from great_expectations.datasource.fluent.interfaces import Batch + +# --------------------------------------------------------------------------- +# Module-level marker: session-scoped FindingsWriter must not be split across +# xdist workers. CI uses --dist loadfile which enforces this automatically. +# --------------------------------------------------------------------------- +pytestmark = [pytest.mark.no_xdist] + +# --------------------------------------------------------------------------- +# Shared fixture data — a superset DataFrame whose columns cover all cases. +# +# Per-case data-shape variance resolution (task 8.1): +# All EXPECTATION_CASES reference columns that exist in this DataFrame. +# Cases needing specific data shapes (dates, JSON strings, pure numerics) +# will run against this data; the expectation may fail validation (e.g. 
+# ExpectColumnValuesToBeDateutilParseable against integers), but that is +# fine — we are testing schema *parsing* of whatever result dict comes back, +# not expectation correctness. SQL backends that cannot operate on a +# VARCHAR column for sum/numeric expectations will produce a batch.validate() +# error which is caught, recorded as status=failed, and surfaced to the +# curator exactly as designed. +# --------------------------------------------------------------------------- +_MATRIX_DATA = pd.DataFrame( + { + "col_a": [1, 2, 3, None, 5], + "col_b": ["x", "y", "z", "w", None], + "col_c": [1.0, 2.0, None, 4.0, 5.0], + } +) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _generate_run_id() -> str: + """Generate a time-stamped run ID when ``--vrs-run-id`` is not supplied.""" + ts = datetime.datetime.now(datetime.timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ") + suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) + return f"{ts}-{suffix}" + + +def _datasource_test_id(batch: Batch) -> str: + """Return a stable identifier for the data source under test.""" + return type(batch.datasource).__name__ + + +# --------------------------------------------------------------------------- +# Session-scoped findings writer fixture +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="session") +def _findings_writer(request: pytest.FixtureRequest) -> FindingsWriter: # type: ignore[return] + """Session-scoped FindingsWriter; yields writer, flushes on session teardown.""" + run_id: str = request.config.getoption("--vrs-run-id") or _generate_run_id() + with FindingsWriter(run_id=run_id) as writer: + yield writer + + +# --------------------------------------------------------------------------- +# Matrix test +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize("case", EXPECTATION_CASES, ids=lambda c: c.id) +@pytest.mark.parametrize("result_format", list(ResultFormat)) +@parameterize_batch_for_data_sources( + data_source_configs=ALL_DATA_SOURCES, + data=_MATRIX_DATA, +) +def test_validation_result_schema_matrix( + batch_for_datasource: Batch, + case: ExpectationCase, + result_format: ResultFormat, + _findings_writer: FindingsWriter, +) -> None: + """Matrix runner: validate every (expectation x result_format x data_source) cell. + + Abstract-stub expectations (5 total) cannot be instantiated; they produce + ``status=failed`` findings. All other expectations should produce + ``status=parsed`` findings. + """ + engine_hint = _normalize_engine_hint(batch_for_datasource.datasource.type) + datasource_test_id = _datasource_test_id(batch_for_datasource) + + # ------------------------------------------------------------------ + # Guard: abstract stubs cannot be validated — record failure immediately + # ------------------------------------------------------------------ + if isinstance(case.expectation, _AbstractStub): + _findings_writer.write_finding( + { + "expectation_type": case.expectation.expectation_type, + "result_format": result_format.value, + "engine": engine_hint, + "datasource_test_id": datasource_test_id, + "status": Status.FAILED.value, + "error_summary": "AbstractStub: expectation not yet implemented", + } + ) + pytest.skip(f"[{case.id}][{result_format.value}][{engine_hint}]: abstract stub — skipped") + + expectation_type: str = case.expectation.expectation_type # type: ignore[union-attr] + + try: + raw_evr = batch_for_datasource.validate( + case.expectation, + result_format=result_format, # type: ignore[arg-type] + ) + except Exception as exc: + _findings_writer.write_finding( + { + "expectation_type": expectation_type, + "result_format": result_format.value, + "engine": engine_hint, + "datasource_test_id": 
datasource_test_id, + "status": Status.FAILED.value, + "error_summary": f"batch.validate raised: {type(exc).__name__}: {exc}", + } + ) + pytest.fail( + f"[{case.id}][{result_format.value}][{engine_hint}]: " + f"batch.validate raised {type(exc).__name__}: {exc}" + ) + + raw_result: dict = raw_evr.result or {} + + try: + # Call as_typed() via the dispatcher directly so we pass the exact result_format + # that was used for the validate() call. raw_evr.as_typed() reads result_format + # from expectation_config.kwargs which may default to SUMMARY instead of the + # result_format we actually exercised. + typed = as_typed( + raw_result, + expectation_type=expectation_type, + result_format=result_format, + engine_hint=engine_hint, + ) + except Exception as exc: + _findings_writer.write_finding( + { + "expectation_type": expectation_type, + "result_format": result_format.value, + "engine": engine_hint, + "datasource_test_id": datasource_test_id, + "status": Status.FAILED.value, + **summarize_raw_dict(raw_result), + "error_summary": f"as_typed raised: {type(exc).__name__}: {exc}", + } + ) + pytest.fail( + f"[{case.id}][{result_format.value}][{engine_hint}]: " + f"as_typed raised {type(exc).__name__}: {exc}" + ) + + # Coverage assertion: every raw key must appear in the parsed model + try: + assert_field_set_covered(raw_result, typed) + except AssertionError as exc: + _findings_writer.write_finding( + { + "expectation_type": expectation_type, + "result_format": result_format.value, + "engine": engine_hint, + "datasource_test_id": datasource_test_id, + "status": Status.FAILED.value, + **summarize_raw_dict(raw_result), + "matched_variant": type(typed).__name__, + "error_summary": str(exc), + } + ) + pytest.fail(f"[{case.id}][{result_format.value}][{engine_hint}]: {exc}") + + # Success path — record parsed finding + model_dict: dict = typed.dict() + schema_required = [k for k in raw_result if k in model_dict] + schema_optional = [k for k in model_dict if k not in raw_result] + + 
_findings_writer.write_finding( + { + "expectation_type": expectation_type, + "result_format": result_format.value, + "engine": engine_hint, + "datasource_test_id": datasource_test_id, + "status": Status.PARSED.value, + **summarize_raw_dict(raw_result), + "matched_variant": type(typed).__name__, + "schema_required_fields_present": schema_required, + "schema_optional_fields_present": schema_optional, + "schema_extras_rejected": [], + } + ) diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/unit/core/__init__.py b/tests/unit/core/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/unit/core/validation_result_schemas/__init__.py b/tests/unit/core/validation_result_schemas/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/unit/core/validation_result_schemas/test_as_typed.py b/tests/unit/core/validation_result_schemas/test_as_typed.py new file mode 100644 index 000000000000..b1a570869c89 --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_as_typed.py @@ -0,0 +1,382 @@ +"""Unit tests for ExpectationValidationResult.as_typed(). + +Covers requirements 2.1-2.6 and 6.1-6.5: +- Returns the correct typed model for map/aggregate expectations. +- Does not mutate the EVR in any way. +- EVR equality is preserved before and after calling as_typed(). +- No new attributes appear in vars(evr) after the call. +- Missing expectation_config falls back to expectation_type='unknown' (aggregate family). +- result_format can be specified as a string, enum, or dict-with-result_format. 
+ +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_as_typed.py -m unit -v +""" + +from __future__ import annotations + +import json +from typing import Optional + +import pytest + +from great_expectations.core.expectation_validation_result import ( + ExpectationValidationResult, +) +from great_expectations.core.validation_result_schemas.schemas.aggregate_result import ( + AggregateBasicResult, + AggregateBooleanOnlyResult, + AggregateCompleteResult, + AggregateSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.map_result import ( + MapBasicResult, + MapBooleanOnlyResult, + MapCompleteResult, + MapSummaryResult, +) +from great_expectations.expectations.expectation_configuration import ( + ExpectationConfiguration, +) + +# --------------------------------------------------------------------------- +# Fixture helpers +# --------------------------------------------------------------------------- + +MAP_BASIC_RESULT = { + "element_count": 100, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_percent_total": 0.0, + "unexpected_percent_nonmissing": 0.0, + "partial_unexpected_list": [], +} + +MAP_SUMMARY_RESULT = { + **MAP_BASIC_RESULT, + "partial_unexpected_counts": [], + "partial_unexpected_index_list": [], +} + +MAP_COMPLETE_RESULT = { + **MAP_SUMMARY_RESULT, + "unexpected_list": [], + "unexpected_index_list": [], +} + +AGG_BASIC_RESULT = { + "observed_value": 42.0, +} + +AGG_SUMMARY_RESULT = { + "observed_value": 42.0, +} + +AGG_COMPLETE_RESULT = { + "observed_value": 42.0, + "unexpected_list": None, + "unexpected_index_list": None, +} + + +def build_map_evr( + result_format: str = "BASIC", result: Optional[dict] = None +) -> ExpectationValidationResult: + """Build a map-family EVR (expect_column_values_to_not_be_null).""" + config = ExpectationConfiguration( + type="expect_column_values_to_not_be_null", + 
kwargs={"column": "col_a", "result_format": result_format}, + ) + return ExpectationValidationResult( + success=True, + expectation_config=config, + result=result if result is not None else dict(MAP_BASIC_RESULT), + ) + + +def build_agg_evr(result_format: str = "BASIC") -> ExpectationValidationResult: + """Build an aggregate-family EVR (expect_column_mean_to_be_between).""" + config = ExpectationConfiguration( + type="expect_column_mean_to_be_between", + kwargs={"column": "col_a", "min_value": 0, "result_format": result_format}, + ) + return ExpectationValidationResult( + success=True, + expectation_config=config, + result=dict(AGG_BASIC_RESULT), + ) + + +# --------------------------------------------------------------------------- +# Return type checks — map family +# --------------------------------------------------------------------------- + + +class TestMapFamilyReturnTypes: + """as_typed returns the correct map-family model class for each ResultFormat.""" + + @pytest.mark.unit + def test_map_boolean_only(self): + config = ExpectationConfiguration( + type="expect_column_values_to_not_be_null", + kwargs={"column": "col_a", "result_format": "BOOLEAN_ONLY"}, + ) + evr = ExpectationValidationResult( + success=True, + expectation_config=config, + result={}, + ) + typed = evr.as_typed() + assert isinstance(typed, MapBooleanOnlyResult) + + @pytest.mark.unit + def test_map_basic(self): + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + typed = evr.as_typed() + assert isinstance(typed, MapBasicResult) + + @pytest.mark.unit + def test_map_summary(self): + evr = build_map_evr(result_format="SUMMARY", result=dict(MAP_SUMMARY_RESULT)) + typed = evr.as_typed() + assert isinstance(typed, MapSummaryResult) + + @pytest.mark.unit + def test_map_complete(self): + evr = build_map_evr(result_format="COMPLETE", result=dict(MAP_COMPLETE_RESULT)) + typed = evr.as_typed() + assert isinstance(typed, MapCompleteResult) + + +# 
--------------------------------------------------------------------------- +# Return type checks — aggregate family +# --------------------------------------------------------------------------- + + +class TestAggregateFamilyReturnTypes: + """as_typed returns the correct aggregate-family model class for each ResultFormat.""" + + @pytest.mark.unit + def test_aggregate_boolean_only(self): + config = ExpectationConfiguration( + type="expect_column_mean_to_be_between", + kwargs={"column": "col_a", "result_format": "BOOLEAN_ONLY"}, + ) + evr = ExpectationValidationResult( + success=True, + expectation_config=config, + result={}, + ) + typed = evr.as_typed() + assert isinstance(typed, AggregateBooleanOnlyResult) + + @pytest.mark.unit + def test_aggregate_basic(self): + evr = build_agg_evr(result_format="BASIC") + typed = evr.as_typed() + assert isinstance(typed, AggregateBasicResult) + + @pytest.mark.unit + def test_aggregate_summary(self): + evr = build_agg_evr(result_format="SUMMARY") + typed = evr.as_typed() + assert isinstance(typed, AggregateSummaryResult) + + @pytest.mark.unit + def test_aggregate_complete(self): + config = ExpectationConfiguration( + type="expect_column_mean_to_be_between", + kwargs={"column": "col_a", "result_format": "COMPLETE"}, + ) + evr = ExpectationValidationResult( + success=True, + expectation_config=config, + result=dict(AGG_COMPLETE_RESULT), + ) + typed = evr.as_typed() + assert isinstance(typed, AggregateCompleteResult) + + +# --------------------------------------------------------------------------- +# No mutation +# --------------------------------------------------------------------------- + + +class TestNoMutation: + """as_typed must not mutate self in any way.""" + + @pytest.mark.unit + def test_result_dict_not_mutated(self): + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + before_result = dict(evr.result) + _ = evr.as_typed() + assert dict(evr.result) == before_result + + @pytest.mark.unit + def 
test_to_json_dict_identical_after_call(self): + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + before_dict = json.dumps(evr.to_json_dict(), sort_keys=True) + _ = evr.as_typed() + assert json.dumps(evr.to_json_dict(), sort_keys=True) == before_dict + + @pytest.mark.unit + def test_no_new_attributes(self): + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + before_vars = set(vars(evr).keys()) + _ = evr.as_typed() + assert set(vars(evr).keys()) == before_vars + + +# --------------------------------------------------------------------------- +# EVR equality preserved +# --------------------------------------------------------------------------- + + +class TestEqualityPreserved: + """as_typed must not affect EVR equality.""" + + @pytest.mark.unit + def test_equality_before_and_after_as_typed(self): + evr1 = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + evr2 = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + assert evr1 == evr2 + _ = evr1.as_typed() + assert evr1 == evr2 + + @pytest.mark.unit + def test_to_json_dict_byte_identical_pair(self): + evr1 = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + evr2 = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + assert json.dumps(evr1.to_json_dict(), sort_keys=True) == json.dumps( + evr2.to_json_dict(), sort_keys=True + ) + _ = evr1.as_typed() + assert json.dumps(evr1.to_json_dict(), sort_keys=True) == json.dumps( + evr2.to_json_dict(), sort_keys=True + ) + + +# --------------------------------------------------------------------------- +# Missing expectation_config fallback +# --------------------------------------------------------------------------- + + +class TestMissingConfigFallback: + """When expectation_config is None, expectation_type defaults to 'unknown'.""" + + @pytest.mark.unit + def test_none_config_routes_to_aggregate_fallback(self): + """'unknown' is not in the family table 
→ falls back to 'aggregate' family.""" + evr = ExpectationValidationResult( + success=True, + expectation_config=None, + result={}, + ) + # family_for('unknown') returns 'aggregate' (fallback) + # result_format defaults to DEFAULT_RESULT_FORMAT (SUMMARY) + # AggregateSummaryResult is the expected class for aggregate + SUMMARY + typed = evr.as_typed() + assert isinstance(typed, AggregateSummaryResult) + + @pytest.mark.unit + def test_none_config_no_mutation(self): + evr = ExpectationValidationResult( + success=True, + expectation_config=None, + result={}, + ) + before_vars = set(vars(evr).keys()) + _ = evr.as_typed() + assert set(vars(evr).keys()) == before_vars + + +# --------------------------------------------------------------------------- +# result_format normalization: string, enum, dict-with-result_format +# --------------------------------------------------------------------------- + + +class TestResultFormatNormalization: + """result_format from kwargs is normalized from string, enum, or dict shapes.""" + + @pytest.mark.unit + def test_string_result_format(self): + """result_format stored as plain string in kwargs.""" + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + typed = evr.as_typed() + assert isinstance(typed, MapBasicResult) + + @pytest.mark.unit + def test_enum_result_format(self): + """result_format stored as ResultFormat enum in kwargs.""" + from great_expectations.core.result_format import ResultFormat + + config = ExpectationConfiguration( + type="expect_column_values_to_not_be_null", + kwargs={"column": "col_a", "result_format": ResultFormat.BASIC}, + ) + evr = ExpectationValidationResult( + success=True, + expectation_config=config, + result=dict(MAP_BASIC_RESULT), + ) + typed = evr.as_typed() + assert isinstance(typed, MapBasicResult) + + @pytest.mark.unit + def test_dict_result_format(self): + """result_format stored as dict with 'result_format' key in kwargs.""" + config = ExpectationConfiguration( + 
type="expect_column_values_to_not_be_null", + kwargs={ + "column": "col_a", + "result_format": {"result_format": "BASIC", "partial_unexpected_count": 20}, + }, + ) + evr = ExpectationValidationResult( + success=True, + expectation_config=config, + result=dict(MAP_BASIC_RESULT), + ) + typed = evr.as_typed() + assert isinstance(typed, MapBasicResult) + + @pytest.mark.unit + def test_missing_result_format_defaults_to_summary(self): + """When result_format is absent from kwargs, DEFAULT_RESULT_FORMAT (SUMMARY) is used.""" + config = ExpectationConfiguration( + type="expect_column_values_to_not_be_null", + kwargs={"column": "col_a"}, # no result_format + ) + evr = ExpectationValidationResult( + success=True, + expectation_config=config, + result=dict(MAP_SUMMARY_RESULT), + ) + typed = evr.as_typed() + # DEFAULT_RESULT_FORMAT is SUMMARY → MapSummaryResult + assert isinstance(typed, MapSummaryResult) + + +# --------------------------------------------------------------------------- +# engine_hint passthrough +# --------------------------------------------------------------------------- + + +class TestEngineHintPassthrough: + """engine_hint is forwarded to the dispatcher without mutating the EVR.""" + + @pytest.mark.unit + def test_engine_hint_pandas_map_basic(self): + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + typed = evr.as_typed(engine_hint="pandas") + assert isinstance(typed, MapBasicResult) + + @pytest.mark.unit + def test_engine_hint_does_not_mutate_evr(self): + evr = build_map_evr(result_format="BASIC", result=dict(MAP_BASIC_RESULT)) + before_vars = set(vars(evr).keys()) + before_result = dict(evr.result) + _ = evr.as_typed(engine_hint="pandas") + assert set(vars(evr).keys()) == before_vars + assert dict(evr.result) == before_result diff --git a/tests/unit/core/validation_result_schemas/test_cases_table.py b/tests/unit/core/validation_result_schemas/test_cases_table.py new file mode 100644 index 000000000000..901edf9cda3b --- 
/dev/null +++ b/tests/unit/core/validation_result_schemas/test_cases_table.py @@ -0,0 +1,53 @@ +"""Unit tests for the EXPECTATION_CASES table. + +Verifies: + (a) every ``id`` in the table is unique + (b) every expectation_type covered by ``family_for`` returns 'map' or 'aggregate' + (c) the case count equals the number of expect_*.py files under core/ +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from great_expectations.core.validation_result_schemas.dispatcher import ( + family_for, +) +from tests.integration.data_sources_and_expectations.expectations._validation_result_schemas_cases import ( # noqa: E501 + EXPECTATION_CASES, +) + + +@pytest.mark.unit +def test_case_ids_are_unique() -> None: + ids = [c.id for c in EXPECTATION_CASES] + assert len(ids) == len(set(ids)), f"Duplicate ids: {sorted(i for i in ids if ids.count(i) > 1)}" + + +@pytest.mark.unit +def test_all_expectation_types_in_family_table() -> None: + for case in EXPECTATION_CASES: + exp_type = case.expectation.expectation_type + family = family_for(exp_type) + assert family in ("map", "aggregate"), f"{exp_type!r} returned unexpected family {family!r}" + + +@pytest.mark.unit +def test_case_count_matches_core_expectations() -> None: + core_dir = ( + Path(__file__).parent + / ".." + / ".." + / ".." + / ".." + / "great_expectations" + / "expectations" + / "core" + ) + core_files = list(core_dir.glob("expect_*.py")) + expected_count = len([f for f in core_files if not f.name.startswith("__")]) + assert len(EXPECTATION_CASES) == expected_count, ( + f"Expected {expected_count} cases, got {len(EXPECTATION_CASES)}" + ) diff --git a/tests/unit/core/validation_result_schemas/test_dispatcher.py b/tests/unit/core/validation_result_schemas/test_dispatcher.py new file mode 100644 index 000000000000..655f8ec8f1aa --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_dispatcher.py @@ -0,0 +1,387 @@ +"""Unit tests for the dispatcher module. 
+ +Covers: +- Synthetic input per (family, format) cell — all 8 combinations. +- Unknown expectation type falls back to 'aggregate'. +- SQL sniffing: engine_hint=None + unexpected_index_query in result_dict → eff_engine='sql'. +- Per-expectation override route (expect_column_values_to_be_of_type on sql/spark). +- ParseError raised with a diagnostic message on bad input. +- test_family_table_covers_core_expectations: every expect_*.py in expectations/core/ is present. + +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_dispatcher.py -m unit -v +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +from great_expectations.core.result_format import ResultFormat +from great_expectations.core.validation_result_schemas.dispatcher import ( + _FAMILY_TABLE, + ParseError, + as_typed, + family_for, +) +from great_expectations.core.validation_result_schemas.schemas.aggregate_result import ( + AggregateBasicResult, + AggregateBooleanOnlyResult, + AggregateCompleteResult, + AggregateSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.map_result import ( + MapBasicResult, + MapBooleanOnlyResult, + MapCompleteResult, + MapSummaryResult, +) +from great_expectations.core.validation_result_schemas.schemas.per_expectation_overrides import ( + ExpectColumnValuesToBeOfTypeSqlSparkResult, +) + +# --------------------------------------------------------------------------- +# A canonical map expectation and aggregate expectation used across tests +# --------------------------------------------------------------------------- + +MAP_EXPECTATION = "expect_column_values_to_be_between" +AGG_EXPECTATION = "expect_column_mean_to_be_between" + +# --------------------------------------------------------------------------- +# Minimal valid result dicts per family x format +# --------------------------------------------------------------------------- + +MAP_BOOLEAN_ONLY_DICT: 
dict = {} +MAP_BASIC_DICT: dict = { + "element_count": 100, + "unexpected_count": 5, + "unexpected_percent": 5.0, + "missing_count": 0, + "missing_percent": 0.0, + "unexpected_percent_total": 5.0, + "unexpected_percent_nonmissing": 5.0, + "partial_unexpected_list": [1, 2, 3], +} +MAP_SUMMARY_DICT: dict = { + **MAP_BASIC_DICT, + "partial_unexpected_counts": [{"value": 1, "count": 2}], + "partial_unexpected_index_list": [0, 1], +} +MAP_COMPLETE_DICT: dict = { + **MAP_SUMMARY_DICT, + "unexpected_list": [1, 2, 3, 4, 5], + "unexpected_index_list": [0, 1, 2, 3, 4], +} + +AGG_BOOLEAN_ONLY_DICT: dict = {} +AGG_BASIC_DICT: dict = {"observed_value": 42.0} +AGG_SUMMARY_DICT: dict = {"observed_value": 42.0} +AGG_COMPLETE_DICT: dict = { + "observed_value": 42.0, + "unexpected_list": None, + "unexpected_index_list": None, +} + + +# --------------------------------------------------------------------------- +# (family, format) matrix — 8 cells +# --------------------------------------------------------------------------- + + +class TestFamilyFormatMatrix: + """as_typed returns the correct model class for every (family, format) cell.""" + + @pytest.mark.unit + def test_map_boolean_only(self): + result = as_typed( + MAP_BOOLEAN_ONLY_DICT, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.BOOLEAN_ONLY, + ) + assert isinstance(result, MapBooleanOnlyResult) + + @pytest.mark.unit + def test_map_basic(self): + result = as_typed( + MAP_BASIC_DICT, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.BASIC, + ) + assert isinstance(result, MapBasicResult) + assert result.element_count == 100 + assert result.unexpected_count == 5 + + @pytest.mark.unit + def test_map_summary(self): + result = as_typed( + MAP_SUMMARY_DICT, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.SUMMARY, + ) + assert isinstance(result, MapSummaryResult) + assert result.partial_unexpected_index_list == [0, 1] + + @pytest.mark.unit + def test_map_complete(self): + result = 
as_typed( + MAP_COMPLETE_DICT, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.COMPLETE, + ) + assert isinstance(result, MapCompleteResult) + assert result.unexpected_list == [1, 2, 3, 4, 5] + + @pytest.mark.unit + def test_aggregate_boolean_only(self): + result = as_typed( + AGG_BOOLEAN_ONLY_DICT, + expectation_type=AGG_EXPECTATION, + result_format=ResultFormat.BOOLEAN_ONLY, + ) + assert isinstance(result, AggregateBooleanOnlyResult) + + @pytest.mark.unit + def test_aggregate_basic(self): + result = as_typed( + AGG_BASIC_DICT, + expectation_type=AGG_EXPECTATION, + result_format=ResultFormat.BASIC, + ) + assert isinstance(result, AggregateBasicResult) + assert result.observed_value == 42.0 + + @pytest.mark.unit + def test_aggregate_summary(self): + result = as_typed( + AGG_SUMMARY_DICT, + expectation_type=AGG_EXPECTATION, + result_format=ResultFormat.SUMMARY, + ) + assert isinstance(result, AggregateSummaryResult) + + @pytest.mark.unit + def test_aggregate_complete(self): + result = as_typed( + AGG_COMPLETE_DICT, + expectation_type=AGG_EXPECTATION, + result_format=ResultFormat.COMPLETE, + ) + assert isinstance(result, AggregateCompleteResult) + + +# --------------------------------------------------------------------------- +# family_for — unknown type falls back to 'aggregate' +# --------------------------------------------------------------------------- + + +class TestFamilyFor: + @pytest.mark.unit + def test_known_map_type(self): + assert family_for("expect_column_values_to_be_between") == "map" + + @pytest.mark.unit + def test_known_aggregate_type(self): + assert family_for("expect_column_mean_to_be_between") == "aggregate" + + @pytest.mark.unit + def test_unknown_type_falls_back_to_aggregate(self): + assert family_for("expect_some_custom_unknown_expectation") == "aggregate" + + @pytest.mark.unit + def test_unknown_type_dispatches_to_aggregate_class(self): + """as_typed uses 'aggregate' family for unknown expectation types.""" + result = 
as_typed( + AGG_BASIC_DICT, + expectation_type="expect_some_custom_unknown_expectation", + result_format=ResultFormat.BASIC, + ) + assert isinstance(result, AggregateBasicResult) + + +# --------------------------------------------------------------------------- +# SQL sniffing +# --------------------------------------------------------------------------- + + +class TestSqlSniffing: + @pytest.mark.unit + def test_sql_sniff_sets_engine_via_unexpected_index_query(self): + """When engine_hint is None but unexpected_index_query is in result_dict, + eff_engine is sniffed as 'sql' and engine_hint is propagated to the model.""" + result_dict = { + **MAP_COMPLETE_DICT, + "unexpected_index_query": "SELECT * FROM table WHERE x < 0", + } + result = as_typed( + result_dict, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.COMPLETE, + engine_hint=None, + ) + assert isinstance(result, MapCompleteResult) + assert result.unexpected_index_query == "SELECT * FROM table WHERE x < 0" + assert result.engine_hint == "sql" + + @pytest.mark.unit + def test_explicit_engine_hint_takes_precedence(self): + """When engine_hint is supplied, SQL sniffing is bypassed.""" + result_dict = { + **MAP_COMPLETE_DICT, + "unexpected_index_query": "SELECT * FROM table WHERE x < 0", + } + result = as_typed( + result_dict, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.COMPLETE, + engine_hint="pandas", + ) + assert isinstance(result, MapCompleteResult) + assert result.engine_hint == "pandas" + + @pytest.mark.unit + def test_no_sniff_without_index_query(self): + """When result_dict has no unexpected_index_query and engine_hint is None, + engine_hint is not injected into the model.""" + result = as_typed( + MAP_COMPLETE_DICT, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.COMPLETE, + engine_hint=None, + ) + assert isinstance(result, MapCompleteResult) + assert result.engine_hint is None + + +# 
--------------------------------------------------------------------------- +# Per-expectation override route +# --------------------------------------------------------------------------- + + +class TestPerExpectationOverride: + @pytest.mark.unit + def test_override_with_sql_engine_hint(self): + """expect_column_values_to_be_of_type + sql → ExpectColumnValuesToBeOfTypeSqlSparkResult.""" + result_dict = {"observed_value": "int64"} + result = as_typed( + result_dict, + expectation_type="expect_column_values_to_be_of_type", + result_format=ResultFormat.SUMMARY, + engine_hint="sql", + ) + assert isinstance(result, ExpectColumnValuesToBeOfTypeSqlSparkResult) + assert result.observed_value == "int64" + + @pytest.mark.unit + def test_override_with_spark_engine_hint(self): + """expect_column_values_to_be_of_type + spark → same override class.""" + result_dict = {"observed_value": "LongType"} + result = as_typed( + result_dict, + expectation_type="expect_column_values_to_be_of_type", + result_format=ResultFormat.COMPLETE, + engine_hint="spark", + ) + assert isinstance(result, ExpectColumnValuesToBeOfTypeSqlSparkResult) + assert result.observed_value == "LongType" + + @pytest.mark.unit + def test_override_sql_engine_hint_direct(self): + """Explicit engine_hint='sql' triggers the override (no sniffing needed).""" + result_dict = {"observed_value": "int64"} + result = as_typed( + result_dict, + expectation_type="expect_column_values_to_be_of_type", + result_format=ResultFormat.COMPLETE, + engine_hint="sql", + ) + assert isinstance(result, ExpectColumnValuesToBeOfTypeSqlSparkResult) + assert result.observed_value == "int64" + + @pytest.mark.unit + def test_no_override_without_engine_hint(self): + """Without sql/spark engine_hint, falls through to family dispatch (map).""" + result_dict = MAP_BASIC_DICT + result = as_typed( + result_dict, + expectation_type="expect_column_values_to_be_of_type", + result_format=ResultFormat.BASIC, + engine_hint=None, + ) + assert 
isinstance(result, MapBasicResult) + + +# --------------------------------------------------------------------------- +# ParseError — raised with diagnostic message +# --------------------------------------------------------------------------- + + +class TestParseError: + @pytest.mark.unit + def test_parse_error_raised_on_bad_dict(self): + """A result_dict with extra fields not accepted by the schema → ParseError.""" + bad_dict = {"totally_unknown_field": "bad_value", "another_bad": 999} + with pytest.raises(ParseError) as exc_info: + as_typed( + bad_dict, + expectation_type=MAP_EXPECTATION, + result_format=ResultFormat.BOOLEAN_ONLY, + ) + msg = str(exc_info.value) + assert "MapBooleanOnlyResult" in msg or "expect_column_values_to_be_between" in msg + + @pytest.mark.unit + def test_parse_error_raised_for_override_on_bad_dict(self): + """Override path raises ParseError when schema rejects extra/missing fields.""" + # ExpectColumnValuesToBeOfTypeSqlSparkResult has extra=forbid. + # An extra field not on the model will trigger validation error. 
+ bad_dict = {"observed_value": "int64", "unexpected_extra_field": "boom"} + with pytest.raises(ParseError) as exc_info: + as_typed( + bad_dict, + expectation_type="expect_column_values_to_be_of_type", + result_format=ResultFormat.SUMMARY, + engine_hint="sql", + ) + msg = str(exc_info.value) + assert "expect_column_values_to_be_of_type" in msg + + @pytest.mark.unit + def test_parse_error_wraps_validation_error(self): + """ParseError.__cause__ is a pydantic.ValidationError.""" + from great_expectations.compatibility import pydantic + + bad_dict = {"bad_field": "unexpected"} + with pytest.raises(ParseError) as exc_info: + as_typed( + bad_dict, + expectation_type=AGG_EXPECTATION, + result_format=ResultFormat.BOOLEAN_ONLY, + ) + assert isinstance(exc_info.value.__cause__, pydantic.ValidationError) + + +# --------------------------------------------------------------------------- +# Coverage test — every expect_*.py in expectations/core/ must be in _FAMILY_TABLE +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_family_table_covers_core_expectations(): + """Every expect_*.py file in expectations/core/ must appear in _FAMILY_TABLE.""" + core_dir = ( + Path(__file__).parent + / ".." + / ".." + / ".." + / ".." + / "great_expectations" + / "expectations" + / "core" + ) + core_files = list(core_dir.glob("expect_*.py")) + expectation_names = { + f.name.replace(".py", "") for f in core_files if not f.name.startswith("__") + } + missing = expectation_names - set(_FAMILY_TABLE.keys()) + assert not missing, f"Missing from _FAMILY_TABLE: {sorted(missing)}" diff --git a/tests/unit/core/validation_result_schemas/test_field_validators.py b/tests/unit/core/validation_result_schemas/test_field_validators.py new file mode 100644 index 000000000000..8bab1ac360cc --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_field_validators.py @@ -0,0 +1,346 @@ +"""Unit tests for field_validators.py. 
+ +Covers: +- classify_runtime_type for every declared RuntimeTypeName enum value +- validate_partial_unexpected_counts_fallback for both valid shapes +- root_validate_engine_required_fields for the skip-when-no-hint and + assert-when-sql-and-requested cases + +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_field_validators.py -m unit +""" + +from __future__ import annotations + +from typing import Any, Dict, List, Optional + +import pytest + +from great_expectations.compatibility import pydantic +from great_expectations.core.validation_result_schemas.field_validators import ( + classify_runtime_type, + root_validate_engine_required_fields, + validate_partial_unexpected_counts_fallback, + validate_unexpected_rows_passthrough, +) +from great_expectations.core.validation_result_schemas.types import RuntimeTypeName + +# --------------------------------------------------------------------------- +# Helpers — minimal Pydantic v1 model for exercising validators +# --------------------------------------------------------------------------- + + +class _PartialCountsModel(pydantic.BaseModel): + """Minimal model to exercise validate_partial_unexpected_counts_fallback.""" + + partial_unexpected_counts: Optional[List[Any]] = None + + _validate_counts = pydantic.validator("partial_unexpected_counts", pre=True, allow_reuse=True)( + validate_partial_unexpected_counts_fallback + ) + + +class _PassthroughModel(pydantic.BaseModel): + """Minimal model to exercise validate_unexpected_rows_passthrough.""" + + unexpected_rows: Any = None + + _validate_rows = pydantic.validator("unexpected_rows", pre=True, allow_reuse=True)( + validate_unexpected_rows_passthrough + ) + + +class _EngineHintModel(pydantic.BaseModel): + """Minimal model to exercise root_validate_engine_required_fields. + + engine_hint is a regular pydantic field (no underscore prefix) so that it + appears in the values dict during root validation. 
In pydantic v1, fields + starting with ``_`` are silently excluded from ``__fields__`` and never + reach the root_validator — making the SQL check dead code. Using a plain + field name avoids that pitfall. + """ + + engine_hint: Optional[str] = None + return_unexpected_index_query: Optional[bool] = None + unexpected_index_query: Optional[str] = None + + _root_validate = pydantic.root_validator(allow_reuse=True)(root_validate_engine_required_fields) + + +# --------------------------------------------------------------------------- +# classify_runtime_type +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_classify_none() -> None: + assert classify_runtime_type(None) == RuntimeTypeName.NONE + + +@pytest.mark.unit +def test_classify_bool() -> None: + # bool must be checked before int since bool is a subclass of int + assert classify_runtime_type(True) == RuntimeTypeName.BOOL + assert classify_runtime_type(False) == RuntimeTypeName.BOOL + + +@pytest.mark.unit +def test_classify_int() -> None: + assert classify_runtime_type(0) == RuntimeTypeName.INT + assert classify_runtime_type(42) == RuntimeTypeName.INT + assert classify_runtime_type(-1) == RuntimeTypeName.INT + + +@pytest.mark.unit +def test_classify_float() -> None: + assert classify_runtime_type(3.14) == RuntimeTypeName.FLOAT + assert classify_runtime_type(0.0) == RuntimeTypeName.FLOAT + + +@pytest.mark.unit +def test_classify_str() -> None: + assert classify_runtime_type("hello") == RuntimeTypeName.STR + assert classify_runtime_type("") == RuntimeTypeName.STR + + +@pytest.mark.unit +def test_classify_list() -> None: + assert classify_runtime_type([]) == RuntimeTypeName.LIST + assert classify_runtime_type([1, 2, 3]) == RuntimeTypeName.LIST + + +@pytest.mark.unit +def test_classify_dict() -> None: + assert classify_runtime_type({}) == RuntimeTypeName.DICT + assert classify_runtime_type({"key": "value"}) == RuntimeTypeName.DICT + + +@pytest.mark.unit +def 
test_classify_pandas_dataframe() -> None: + """pandas DataFrame should return DATAFRAME_PANDAS without requiring pandas at import time.""" + pd = pytest.importorskip("pandas") + df = pd.DataFrame({"a": [1, 2, 3]}) + assert classify_runtime_type(df) == RuntimeTypeName.DATAFRAME_PANDAS + + +@pytest.mark.unit +def test_classify_spark_dataframe_other_when_pyspark_unavailable() -> None: + """When pyspark is unavailable, a mock object named DataFrame from pyspark should + be classified as DATAFRAME_SPARK if it looks like pyspark, or OTHER otherwise.""" + + # Without actual pyspark, we simulate the check using a mock + # The classifier should detect pyspark via module path inspection + class _FakeSparkDataFrame: + pass + + # Give it a pyspark-like module path + _FakeSparkDataFrame.__module__ = "pyspark.sql.dataframe" + _FakeSparkDataFrame.__name__ = "DataFrame" + + fake_spark_df = _FakeSparkDataFrame() + result = classify_runtime_type(fake_spark_df) + assert result == RuntimeTypeName.DATAFRAME_SPARK + + +@pytest.mark.unit +def test_classify_other_for_unknown_type() -> None: + class _CustomObject: + pass + + assert classify_runtime_type(_CustomObject()) == RuntimeTypeName.OTHER + assert classify_runtime_type(object()) == RuntimeTypeName.OTHER + + +@pytest.mark.unit +def test_classify_never_raises() -> None: + """classify_runtime_type must never raise regardless of input.""" + + # Includes edge cases: class instances, iterators, generators + class _WeirdObject: + def __class_getitem__(cls, item: Any) -> Any: + raise RuntimeError("should never be called") + + for value in [ + _WeirdObject(), + (1, 2, 3), # tuple -> OTHER + {1, 2, 3}, # set -> OTHER + lambda: None, # callable -> OTHER + ]: + result = classify_runtime_type(value) + assert isinstance(result, RuntimeTypeName), f"Expected RuntimeTypeName for {value!r}" + + +# --------------------------------------------------------------------------- +# validate_unexpected_rows_passthrough +# 
--------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_passthrough_accepts_none() -> None: + m = _PassthroughModel(unexpected_rows=None) + assert m.unexpected_rows is None + + +@pytest.mark.unit +def test_passthrough_accepts_list() -> None: + rows = [{"a": 1}, {"a": 2}] + m = _PassthroughModel(unexpected_rows=rows) + assert m.unexpected_rows == rows + + +@pytest.mark.unit +def test_passthrough_accepts_dict() -> None: + m = _PassthroughModel(unexpected_rows={"a": 1}) + assert m.unexpected_rows == {"a": 1} + + +@pytest.mark.unit +def test_passthrough_returns_value_unchanged() -> None: + sentinel = object() + # Can't pass an arbitrary object through pydantic's JSON serialization, but + # we can verify the validator function directly + result = validate_unexpected_rows_passthrough(None, sentinel) + assert result is sentinel + + +# --------------------------------------------------------------------------- +# validate_partial_unexpected_counts_fallback +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_partial_counts_accepts_none() -> None: + m = _PartialCountsModel(partial_unexpected_counts=None) + assert m.partial_unexpected_counts is None + + +@pytest.mark.unit +def test_partial_counts_accepts_canonical_shape() -> None: + """Canonical shape: [{value: x, count: n}, ...]""" + counts = [{"value": "foo", "count": 3}, {"value": "bar", "count": 1}] + m = _PartialCountsModel(partial_unexpected_counts=counts) + assert m.partial_unexpected_counts == counts + + +@pytest.mark.unit +def test_partial_counts_accepts_error_fallback_shape() -> None: + """Error fallback shape: [{"error": "partial_exception_counts requires a hashable type"}]""" + fallback = [{"error": "partial_exception_counts requires a hashable type"}] + m = _PartialCountsModel(partial_unexpected_counts=fallback) + assert m.partial_unexpected_counts == fallback + + +@pytest.mark.unit +def 
test_partial_counts_accepts_empty_list() -> None: + m = _PartialCountsModel(partial_unexpected_counts=[]) + assert m.partial_unexpected_counts == [] + + +@pytest.mark.unit +def test_partial_counts_returns_value_unchanged() -> None: + counts = [{"value": "x", "count": 5}] + result = validate_partial_unexpected_counts_fallback(None, counts) + assert result == counts + + +@pytest.mark.unit +def test_partial_counts_none_returned_as_none() -> None: + result = validate_partial_unexpected_counts_fallback(None, None) + assert result is None + + +# --------------------------------------------------------------------------- +# root_validate_engine_required_fields +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_root_validate_no_hint_is_noop() -> None: + """When no engine hint is present, the validator is a no-op (no assertion).""" + # No engine_hint set; return_unexpected_index_query=True but no unexpected_index_query + # should NOT raise because there is no hint to trigger the SQL check + values: Dict[str, Any] = { + "return_unexpected_index_query": True, + "unexpected_index_query": None, + } + result = root_validate_engine_required_fields(None, values) + assert result == values + + +@pytest.mark.unit +def test_root_validate_sql_hint_with_requested_and_present() -> None: + """When engine_hint='sql', return_unexpected_index_query=True, and + unexpected_index_query is present, the validator should pass.""" + values: Dict[str, Any] = { + "engine_hint": "sql", + "return_unexpected_index_query": True, + "unexpected_index_query": "SELECT * FROM ...", + } + result = root_validate_engine_required_fields(None, values) + assert result == values + + +@pytest.mark.unit +def test_root_validate_sql_hint_with_requested_but_missing_raises() -> None: + """When engine_hint='sql', return_unexpected_index_query=True, but + unexpected_index_query is absent (None), the validator should raise ValueError.""" + values: Dict[str, Any] = 
{ + "engine_hint": "sql", + "return_unexpected_index_query": True, + "unexpected_index_query": None, + } + with pytest.raises((ValueError, pydantic.ValidationError)): + root_validate_engine_required_fields(None, values) + + +@pytest.mark.unit +def test_root_validate_sql_hint_without_requested_is_noop() -> None: + """When engine_hint='sql' but return_unexpected_index_query is False/absent, + the validator should pass even without unexpected_index_query.""" + values: Dict[str, Any] = { + "engine_hint": "sql", + "return_unexpected_index_query": False, + "unexpected_index_query": None, + } + result = root_validate_engine_required_fields(None, values) + assert result == values + + +@pytest.mark.unit +def test_root_validate_non_sql_hint_with_requested_but_missing_is_noop() -> None: + """When engine_hint is not 'sql' (e.g., 'pandas'), the SQL assertion is skipped.""" + values: Dict[str, Any] = { + "engine_hint": "pandas", + "return_unexpected_index_query": True, + "unexpected_index_query": None, + } + result = root_validate_engine_required_fields(None, values) + assert result == values + + +@pytest.mark.unit +def test_root_validate_via_model_no_hint() -> None: + """Integration check: model construction without engine hint passes.""" + m = _EngineHintModel( + return_unexpected_index_query=True, + unexpected_index_query=None, + ) + assert m.return_unexpected_index_query is True + assert m.unexpected_index_query is None + + +@pytest.mark.unit +def test_root_validate_via_model_sql_enforcement_fires() -> None: + """Model-level SQL enforcement: engine_hint='sql' + return_unexpected_index_query=True + + unexpected_index_query=None must raise pydantic.ValidationError. + + This test verifies that engine_hint is a real pydantic field (not a private + attribute with underscore prefix), so the root_validator actually receives it + in the values dict and can enforce the SQL-required-field constraint. 
+ """ + with pytest.raises(pydantic.ValidationError): + _EngineHintModel( + engine_hint="sql", + return_unexpected_index_query=True, + unexpected_index_query=None, + ) diff --git a/tests/unit/core/validation_result_schemas/test_findings_emitter.py b/tests/unit/core/validation_result_schemas/test_findings_emitter.py new file mode 100644 index 000000000000..d5425612b737 --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_findings_emitter.py @@ -0,0 +1,311 @@ +"""Unit tests for findings_emitter.py. + +Covers: +- Round-trip: write N findings via context manager, read back JSON, assert structure +- Determinism: two identical runs produce byte-identical output (modulo timestamps) +- Env-var resolution: GX_VALIDATION_FINDINGS_DIR overrides default +- Atomic write: if Path.replace raises, the destination file is unchanged + +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_findings_emitter.py -m unit +""" + +from __future__ import annotations + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import TYPE_CHECKING, List +from unittest.mock import patch + +import pytest + +from great_expectations.core.validation_result_schemas.findings_emitter import ( + _DEFAULT_DIR, + _ENV_VAR, + SCHEMA_VERSION, + FindingsWriter, +) + +if TYPE_CHECKING: + from great_expectations.core.validation_result_schemas.types import Finding + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +_SAMPLE_FINDINGS: List[Finding] = [ + { + "expectation_type": "expect_column_values_to_not_be_null", + "result_format": "COMPLETE", + "engine": "pandas", + "datasource_test_id": "ds-001", + "status": "parsed", + }, + { + "expectation_type": "expect_column_to_exist", + "result_format": "BASIC", + "engine": "spark", + "datasource_test_id": "ds-002", + "status": "parsed", + }, + 
{ + "expectation_type": "expect_column_values_to_be_in_set", + "result_format": "SUMMARY", + "engine": "pandas", + "datasource_test_id": "ds-003", + "status": "failed", + "error_summary": "schema mismatch", + }, +] + +_FIXED_TS = "2026-05-07T14:23:11Z" + + +def _mock_now(*args, **kwargs): + """Return a fixed datetime for deterministic timestamp tests.""" + return datetime(2026, 5, 7, 14, 23, 11, tzinfo=timezone.utc) + + +# --------------------------------------------------------------------------- +# 1. Round-trip: write N findings, read back JSON, assert structure +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_round_trip_findings(tmp_path: Path) -> None: + """Write findings via context manager and verify the JSON envelope.""" + run_id = "test-round-trip-run" + with patch( + "great_expectations.core.validation_result_schemas.findings_emitter.datetime" + ) as mock_dt: + mock_dt.now.return_value = _mock_now() + mock_dt.now.side_effect = _mock_now + + with FindingsWriter(run_id, output_dir=tmp_path) as writer: + for finding in _SAMPLE_FINDINGS: + writer.write_finding(finding) + + output_file = tmp_path / f"{run_id}.json" + assert output_file.exists(), "Output file should exist after close()" + + with output_file.open() as f: + data = json.load(f) + + # Envelope fields + assert data["schema_version"] == SCHEMA_VERSION + assert data["run_id"] == run_id + assert "started_at_utc" in data + assert "completed_at_utc" in data + assert "gx_version" in data + assert isinstance(data["gx_version"], str) + assert isinstance(data["findings"], list) + assert len(data["findings"]) == len(_SAMPLE_FINDINGS) + + # Spot-check one finding field + types_in_output = {f["expectation_type"] for f in data["findings"]} + assert "expect_column_values_to_not_be_null" in types_in_output + assert "expect_column_to_exist" in types_in_output + + +# --------------------------------------------------------------------------- +# 2. 
# Determinism: two runs with same findings produce identical findings list
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_deterministic_output(tmp_path: Path) -> None:
    """Two runs with the same findings produce byte-identical output files.

    Both runs share one ``run_id`` and the same mocked ``datetime.now``, so the
    whole file -- not just the parsed payload -- is expected to match.
    """
    run_id = "deterministic-run"
    dirs = [tmp_path / "run1", tmp_path / "run2"]
    for d in dirs:
        d.mkdir()

    for output_dir in dirs:
        with patch(
            "great_expectations.core.validation_result_schemas.findings_emitter.datetime"
        ) as mock_dt:
            # Every datetime.now() call made by the emitter returns the fixed
            # value from _mock_now, so timestamps cannot differ between runs.
            mock_dt.now.side_effect = _mock_now

            with FindingsWriter(run_id, output_dir=output_dir) as writer:
                for finding in _SAMPLE_FINDINGS:
                    writer.write_finding(finding)

    # Read raw bytes: avoids locale-dependent decoding and lets us compare the
    # serialized form directly. json.loads accepts bytes.
    raw1 = (dirs[0] / f"{run_id}.json").read_bytes()
    raw2 = (dirs[1] / f"{run_id}.json").read_bytes()

    data1 = json.loads(raw1)
    data2 = json.loads(raw2)

    # Findings lists should be identical (same sort order)
    assert data1["findings"] == data2["findings"]

    # With mocked timestamps, full envelope should also be identical
    assert data1 == data2

    # Parsed equality ignores key order and whitespace; the byte comparison is
    # what actually backs the "byte-identical" claim.
    assert raw1 == raw2


@pytest.mark.unit
def test_findings_sorted_by_sort_key(tmp_path: Path) -> None:
    """Findings are sorted by (expectation_type, engine, result_format)."""
    run_id = "sorted-run"

    def sort_key(finding):
        # Same tuple the assertion below sorts on; missing fields count as "".
        return (
            finding.get("expectation_type", ""),
            finding.get("engine", ""),
            finding.get("result_format", ""),
        )

    # Feed the writer the sample findings back-to-front. That order is not
    # sorted by the key above, which is what makes the final assertion
    # meaningful; guard it so a future fixture edit cannot make the test vacuous.
    findings_reversed = list(reversed(_SAMPLE_FINDINGS))
    input_keys = [sort_key(f) for f in findings_reversed]
    assert input_keys != sorted(input_keys), (
        "Input findings must be unsorted for this test to prove anything"
    )

    with FindingsWriter(run_id, output_dir=tmp_path) as writer:
        for finding in findings_reversed:
            writer.write_finding(finding)

    data = json.loads((tmp_path / f"{run_id}.json").read_bytes())
    sort_keys = [sort_key(f) for f in data["findings"]]
    assert sort_keys == sorted(sort_keys)


# ---------------------------------------------------------------------------
# 3.
# Env-var resolution
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_env_var_resolution(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """With no explicit output_dir, the file lands in the env-var directory."""
    from_env = tmp_path / "env_output"
    from_env.mkdir()
    monkeypatch.setenv(_ENV_VAR, str(from_env))

    run_id = "env-var-run"
    with FindingsWriter(run_id) as writer:
        writer.write_finding(_SAMPLE_FINDINGS[0])

    written = from_env / f"{run_id}.json"
    assert written.exists()


@pytest.mark.unit
def test_explicit_output_dir_overrides_env_var(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """An explicit output_dir wins even when the env var is also set."""
    from_env = tmp_path / "env_output"
    from_arg = tmp_path / "explicit_output"
    for directory in (from_env, from_arg):
        directory.mkdir()

    # The env var points somewhere else on purpose; it must be ignored.
    monkeypatch.setenv(_ENV_VAR, str(from_env))

    run_id = "explicit-override-run"
    file_name = f"{run_id}.json"
    with FindingsWriter(run_id, output_dir=from_arg) as writer:
        writer.write_finding(_SAMPLE_FINDINGS[0])

    assert (from_arg / file_name).exists()
    assert not (from_env / file_name).exists()


@pytest.mark.unit
def test_default_dir_used_when_no_env_var(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """With neither an argument nor the env var, the writer falls back to _DEFAULT_DIR."""
    monkeypatch.delenv(_ENV_VAR, raising=False)

    run_id = "default-dir-run"
    expected_dir = Path(_DEFAULT_DIR)
    # Writing into the real default location is avoided here; only the
    # directory the instance resolved to is inspected.
    # NOTE(review): the patch is meant to stop the constructor from creating
    # the directory, which only holds if the writer goes through os.makedirs
    # -- confirm against FindingsWriter.
    with patch("os.makedirs"):
        writer = FindingsWriter(run_id)
        assert writer._output_dir == expected_dir


# ---------------------------------------------------------------------------
# 4.
# Atomic write: if Path.replace raises, destination file is unchanged
# ---------------------------------------------------------------------------


@pytest.mark.unit
def test_atomic_write_no_partial_file_on_failure(tmp_path: Path) -> None:
    """A failing Path.replace must not leave a destination file behind."""
    run_id = "atomic-fail-run"
    target = tmp_path / f"{run_id}.json"
    assert not target.exists()

    replace_attr = "great_expectations.core.validation_result_schemas.findings_emitter.Path.replace"
    boom = OSError("simulated replace failure")
    with patch(replace_attr, side_effect=boom):
        writer = FindingsWriter(run_id, output_dir=tmp_path)
        writer.write_finding(_SAMPLE_FINDINGS[0])
        # close() is where the error surfaces, so it is called explicitly
        # instead of going through the context manager.
        with pytest.raises(OSError, match="simulated replace failure"):
            writer.close()

    # The replace step never succeeded, so nothing may exist at the target path.
    assert not target.exists(), "Destination file must not exist after failed atomic write"


@pytest.mark.unit
def test_atomic_write_preserves_existing_on_failure(tmp_path: Path) -> None:
    """A failing Path.replace must leave a pre-existing destination file untouched."""
    run_id = "atomic-overwrite-run"
    previous = '{"old": "content"}'
    target = tmp_path / f"{run_id}.json"
    target.write_text(previous)

    replace_attr = "great_expectations.core.validation_result_schemas.findings_emitter.Path.replace"
    boom = OSError("simulated replace failure")
    with patch(replace_attr, side_effect=boom):
        writer = FindingsWriter(run_id, output_dir=tmp_path)
        writer.write_finding(_SAMPLE_FINDINGS[0])
        with pytest.raises(OSError):
            writer.close()

    survived = target.read_text()
    assert survived == previous, (
        "Existing file must be unchanged after failed atomic write"
    )


# ---------------------------------------------------------------------------
# 5.
Context-manager protocol +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_context_manager_calls_close(tmp_path: Path) -> None: + """Exiting context manager calls close() and produces output.""" + run_id = "ctx-manager-run" + with FindingsWriter(run_id, output_dir=tmp_path) as writer: + writer.write_finding(_SAMPLE_FINDINGS[0]) + + assert (tmp_path / f"{run_id}.json").exists() + + +@pytest.mark.unit +def test_context_manager_propagates_exception(tmp_path: Path) -> None: + """Exception inside context manager propagates after close().""" + run_id = "ctx-exception-run" + with pytest.raises(ValueError, match="test error"): + with FindingsWriter(run_id, output_dir=tmp_path) as writer: + writer.write_finding(_SAMPLE_FINDINGS[0]) + raise ValueError("test error") + + +# --------------------------------------------------------------------------- +# 6. Module constants +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_schema_version_is_int() -> None: + assert isinstance(SCHEMA_VERSION, int) + assert SCHEMA_VERSION == 1 + + +@pytest.mark.unit +def test_default_dir_is_path() -> None: + assert isinstance(_DEFAULT_DIR, Path) + + +@pytest.mark.unit +def test_env_var_name() -> None: + assert _ENV_VAR == "GX_VALIDATION_FINDINGS_DIR" diff --git a/tests/unit/core/validation_result_schemas/test_format_config.py b/tests/unit/core/validation_result_schemas/test_format_config.py new file mode 100644 index 000000000000..e2670915dd03 --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_format_config.py @@ -0,0 +1,186 @@ +"""Unit tests for ResultFormatConfig TypedDict. + +Round-trips parse_result_format() output under each ResultFormat value, +asserting required keys are present and optional keys behave correctly. 
+""" + +from __future__ import annotations + +import pytest + +from great_expectations.core.result_format import ResultFormat +from great_expectations.core.validation_result_schemas.format_config import ( + ResultFormatConfig, + ResultFormatConfigRequired, +) +from great_expectations.expectations.expectation_configuration import parse_result_format + +REQUIRED_KEYS = frozenset( + { + "result_format", + "partial_unexpected_count", + "include_unexpected_rows", + "map_expectation_unexpected_rows_as_dict", + } +) +OPTIONAL_KEYS = frozenset({"exclude_unexpected_values", "return_unexpected_index_query"}) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _assert_required_keys_present(config: dict) -> None: + """Assert all required keys are present in the config dict.""" + missing = REQUIRED_KEYS - config.keys() + assert not missing, f"Missing required keys: {missing}" + + +def _assert_optional_keys_absent(config: dict) -> None: + """Assert optional keys are NOT present (string-only parse_result_format input).""" + present = OPTIONAL_KEYS & config.keys() + assert not present, f"Optional keys should be absent but found: {present}" + + +# --------------------------------------------------------------------------- +# Tests: string-form parse_result_format produces only required keys +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_boolean_only_required_keys_present() -> None: + raw = parse_result_format(ResultFormat.BOOLEAN_ONLY.value) + config: ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + _assert_optional_keys_absent(config) + assert config["result_format"] == ResultFormat.BOOLEAN_ONLY.value + + +@pytest.mark.unit +def test_basic_required_keys_present() -> None: + raw = parse_result_format(ResultFormat.BASIC.value) + config: 
ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + _assert_optional_keys_absent(config) + assert config["result_format"] == ResultFormat.BASIC.value + + +@pytest.mark.unit +def test_summary_required_keys_present() -> None: + raw = parse_result_format(ResultFormat.SUMMARY.value) + config: ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + _assert_optional_keys_absent(config) + assert config["result_format"] == ResultFormat.SUMMARY.value + + +@pytest.mark.unit +def test_complete_required_keys_present() -> None: + raw = parse_result_format(ResultFormat.COMPLETE.value) + config: ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + _assert_optional_keys_absent(config) + assert config["result_format"] == ResultFormat.COMPLETE.value + + +# --------------------------------------------------------------------------- +# Tests: dict-form parse_result_format with optional keys present +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_optional_exclude_unexpected_values_present_when_supplied() -> None: + raw = parse_result_format( + { + "result_format": ResultFormat.COMPLETE.value, + "exclude_unexpected_values": True, + } + ) + config: ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + assert "exclude_unexpected_values" in config + assert config["exclude_unexpected_values"] is True # type: ignore[typeddict-item] + + +@pytest.mark.unit +def test_optional_return_unexpected_index_query_present_when_supplied() -> None: + raw = parse_result_format( + { + "result_format": ResultFormat.COMPLETE.value, + "return_unexpected_index_query": False, + } + ) + config: ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + assert "return_unexpected_index_query" in config + assert config["return_unexpected_index_query"] is False 
# type: ignore[typeddict-item] + + +@pytest.mark.unit +def test_both_optional_keys_present_when_supplied() -> None: + raw = parse_result_format( + { + "result_format": ResultFormat.SUMMARY.value, + "exclude_unexpected_values": False, + "return_unexpected_index_query": True, + } + ) + config: ResultFormatConfig = raw # type: ignore[assignment] + _assert_required_keys_present(config) + assert "exclude_unexpected_values" in config + assert "return_unexpected_index_query" in config + + +# --------------------------------------------------------------------------- +# Tests: partial_unexpected_count default +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_partial_unexpected_count_defaults_to_20() -> None: + raw = parse_result_format(ResultFormat.BASIC.value) + assert raw["partial_unexpected_count"] == 20 + + +@pytest.mark.unit +def test_partial_unexpected_count_preserved_when_supplied() -> None: + raw = parse_result_format( + { + "result_format": ResultFormat.BASIC.value, + "partial_unexpected_count": 5, + } + ) + assert raw["partial_unexpected_count"] == 5 + + +# --------------------------------------------------------------------------- +# Tests: TypedDict structural constraints +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_result_format_config_required_is_typeddict() -> None: + """Confirm ResultFormatConfigRequired is a TypedDict (not a runtime check, but importable).""" + # Verify the class exists and has the expected annotations + annotations = ResultFormatConfigRequired.__annotations__ + assert "result_format" in annotations + assert "partial_unexpected_count" in annotations + assert "include_unexpected_rows" in annotations + assert "map_expectation_unexpected_rows_as_dict" in annotations + + +@pytest.mark.unit +def test_result_format_config_extends_required() -> None: + """Confirm ResultFormatConfig inherits required keys from 
ResultFormatConfigRequired.""" + # TypedDict merges required keys from bases into __required_keys__; works on 3.10+. + assert ResultFormatConfigRequired.__required_keys__ <= ResultFormatConfig.__required_keys__ + + +@pytest.mark.unit +def test_result_format_config_has_optional_keys_in_annotations() -> None: + """Confirm ResultFormatConfig declares optional keys.""" + # ResultFormatConfig (total=False subclass) owns the optional fields + own_annotations = ResultFormatConfig.__annotations__ + assert "exclude_unexpected_values" in own_annotations + assert "return_unexpected_index_query" in own_annotations diff --git a/tests/unit/core/validation_result_schemas/test_runner_helpers.py b/tests/unit/core/validation_result_schemas/test_runner_helpers.py new file mode 100644 index 000000000000..544242c5dd78 --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_runner_helpers.py @@ -0,0 +1,210 @@ +"""Unit tests for matrix runner helpers. + +Covers: +- assert_field_set_covered: all raw keys present in model passes; + missing raw key raises AssertionError with key name; model extras are ignored +- summarize_raw_dict: empty dict, scalar/list/dict values, None values; + structure only — never values +- _normalize_engine_hint: pandas passthrough, spark/dataframe normalization, + all SQL dialects collapse to 'sql', unknown types returned as-is + +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_runner_helpers.py -m unit +""" + +from __future__ import annotations + +from typing import Any, Dict, Optional + +import pytest + +from great_expectations.compatibility import pydantic +from great_expectations.core.validation_result_schemas.types import RuntimeTypeName +from tests.integration.data_sources_and_expectations.expectations import ( + _validation_result_schemas_helpers as _helpers, +) + +_normalize_engine_hint = _helpers._normalize_engine_hint +assert_field_set_covered = _helpers.assert_field_set_covered 
+summarize_raw_dict = _helpers.summarize_raw_dict + +# --------------------------------------------------------------------------- +# Minimal pydantic model for exercising assert_field_set_covered +# --------------------------------------------------------------------------- + + +class _SimpleModel(pydantic.BaseModel): + """Minimal model with a known field set, plus an extra engine_hint field.""" + + success: Optional[bool] = None + result: Optional[Dict[str, Any]] = None + exception_info: Optional[Dict[str, Any]] = None + engine_hint: Optional[str] = None + + +# --------------------------------------------------------------------------- +# assert_field_set_covered +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_assert_field_set_covered_all_present() -> None: + """When all raw keys exist in the model dict, no assertion is raised.""" + raw = {"success": True, "result": {"observed_value": 42}} + model = _SimpleModel(success=True, result={"observed_value": 42}) + # Should not raise + assert_field_set_covered(raw, model) + + +@pytest.mark.unit +def test_assert_field_set_covered_model_extra_keys_are_ok() -> None: + """Model may have extra keys (engine_hint) not in raw — that's fine.""" + raw = {"success": True} + model = _SimpleModel(success=True, engine_hint="pandas") + # engine_hint is in model but not in raw — should not raise + assert_field_set_covered(raw, model) + + +@pytest.mark.unit +def test_assert_field_set_covered_missing_raw_key_raises() -> None: + """A raw key absent from the model dict causes AssertionError.""" + + class _NarrowModel(pydantic.BaseModel): + success: Optional[bool] = None + + raw = {"success": True, "missing_field": "some_value"} + model = _NarrowModel(success=True) + with pytest.raises(AssertionError, match="missing_field"): + assert_field_set_covered(raw, model) + + +@pytest.mark.unit +def test_assert_field_set_covered_multiple_missing_keys_reported() -> None: + """All absent 
keys are reported together in the AssertionError message.""" + + class _EmptyModel(pydantic.BaseModel): + pass + + raw = {"key_a": 1, "key_b": 2} + model = _EmptyModel() + with pytest.raises(AssertionError) as exc_info: + assert_field_set_covered(raw, model) + msg = str(exc_info.value) + assert "key_a" in msg + assert "key_b" in msg + + +# --------------------------------------------------------------------------- +# summarize_raw_dict +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_summarize_raw_dict_empty() -> None: + """Empty dict returns empty raw_field_set and raw_field_types.""" + result = summarize_raw_dict({}) + assert result == {"raw_field_set": [], "raw_field_types": {}} + + +@pytest.mark.unit +def test_summarize_raw_dict_scalar_values() -> None: + """Scalar values are classified to the correct RuntimeTypeName.""" + raw = { + "an_int": 42, + "a_float": 3.14, + "a_str": "hello", + "a_bool": True, + } + result = summarize_raw_dict(raw) + assert result["raw_field_set"] == sorted(raw.keys()) + assert result["raw_field_types"]["an_int"] == RuntimeTypeName.INT.value + assert result["raw_field_types"]["a_float"] == RuntimeTypeName.FLOAT.value + assert result["raw_field_types"]["a_str"] == RuntimeTypeName.STR.value + assert result["raw_field_types"]["a_bool"] == RuntimeTypeName.BOOL.value + + +@pytest.mark.unit +def test_summarize_raw_dict_collection_values() -> None: + """list and dict values are classified correctly.""" + raw = { + "a_list": [1, 2, 3], + "a_dict": {"nested": True}, + } + result = summarize_raw_dict(raw) + assert result["raw_field_types"]["a_list"] == RuntimeTypeName.LIST.value + assert result["raw_field_types"]["a_dict"] == RuntimeTypeName.DICT.value + + +@pytest.mark.unit +def test_summarize_raw_dict_none_values() -> None: + """None values are classified as RuntimeTypeName.NONE.""" + raw = {"nullable_field": None} + result = summarize_raw_dict(raw) + assert 
result["raw_field_types"]["nullable_field"] == RuntimeTypeName.NONE.value + + +@pytest.mark.unit +def test_summarize_raw_dict_field_set_is_sorted() -> None: + """raw_field_set must be in sorted order regardless of insertion order.""" + raw = {"z_last": 1, "a_first": 2, "m_middle": 3} + result = summarize_raw_dict(raw) + assert result["raw_field_set"] == ["a_first", "m_middle", "z_last"] + + +@pytest.mark.unit +def test_summarize_raw_dict_never_includes_values() -> None: + """The result dict must not contain raw field values — only structure.""" + raw = {"secret_value": "do_not_leak_this"} + result = summarize_raw_dict(raw) + # Values should not appear anywhere in the output + assert "do_not_leak_this" not in str(result) + # But the key (structure) should be present + assert "secret_value" in result["raw_field_set"] + + +# --------------------------------------------------------------------------- +# _normalize_engine_hint +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_normalize_engine_hint_pandas() -> None: + assert _normalize_engine_hint("pandas") == "pandas" + + +@pytest.mark.unit +def test_normalize_engine_hint_spark() -> None: + assert _normalize_engine_hint("spark") == "spark" + + +@pytest.mark.unit +def test_normalize_engine_hint_dataframe_to_spark() -> None: + assert _normalize_engine_hint("dataframe") == "spark" + + +@pytest.mark.unit +@pytest.mark.parametrize( + "dialect", + [ + "sql", + "snowflake", + "postgres", + "redshift", + "databricks_sql", + "sqlite", + "bigquery", + "mysql", + "mssql", + ], +) +def test_normalize_engine_hint_sql_dialects(dialect: str) -> None: + """All SQL dialects collapse to 'sql'.""" + assert _normalize_engine_hint(dialect) == "sql" + + +@pytest.mark.unit +def test_normalize_engine_hint_unknown_passthrough() -> None: + """Unknown engine types are returned as-is.""" + assert _normalize_engine_hint("unknown_engine_xyz") == "unknown_engine_xyz" + assert 
_normalize_engine_hint("dask") == "dask" + assert _normalize_engine_hint("") == "" diff --git a/tests/unit/core/validation_result_schemas/test_schemas_aggregate.py b/tests/unit/core/validation_result_schemas/test_schemas_aggregate.py new file mode 100644 index 000000000000..458d39dcce8d --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_schemas_aggregate.py @@ -0,0 +1,388 @@ +"""Unit tests for the AggregateResult schema family. + +Covers: +- Each format variant (AggregateBooleanOnlyResult, AggregateBasicResult, + AggregateSummaryResult, AggregateCompleteResult) parses a valid result dict correctly. +- All expected fields match the input. +- Unknown extra fields raise pydantic.ValidationError (extra=forbid). +- ObservedValue union accepts scalar (int, float, str, bool), list, dict, and None shapes. +- Details field is optional and accepts None or dict. +- Every format variant can be constructed with minimal (empty) args. + +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_schemas_aggregate.py -m unit +""" + +from __future__ import annotations + +import pytest + +from great_expectations.compatibility import pydantic +from great_expectations.core.validation_result_schemas.schemas.aggregate_result import ( + AggregateBasicResult, + AggregateBooleanOnlyResult, + AggregateCompleteResult, + AggregateResultBase, + AggregateSummaryResult, +) + +# --------------------------------------------------------------------------- +# Shared fixture data +# --------------------------------------------------------------------------- + +_BASIC_RESULT_DATA = { + "element_count": 200, + "missing_count": 10, + "missing_percent": 5.0, + "partial_unexpected_list": ["a", "b"], + "partial_missing_list": [None], +} + +_COMPLETE_EXTRA_DATA = { + "unexpected_list": ["a", "b", "c"], + "unexpected_index_list": [0, 1, 2], +} + + +# --------------------------------------------------------------------------- +# ObservedValue 
union shapes +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_observed_value_int() -> None: + """ObservedValue accepts int scalar.""" + m = AggregateResultBase(observed_value=42) + assert m.observed_value == 42 + + +@pytest.mark.unit +def test_observed_value_float() -> None: + """ObservedValue accepts float scalar.""" + m = AggregateResultBase(observed_value=3.14) + assert m.observed_value == 3.14 + + +@pytest.mark.unit +def test_observed_value_str() -> None: + """ObservedValue accepts string scalar.""" + m = AggregateResultBase(observed_value="mean=3.14") + assert m.observed_value == "mean=3.14" + + +@pytest.mark.unit +def test_observed_value_bool() -> None: + """ObservedValue accepts bool scalar.""" + m = AggregateResultBase(observed_value=True) + assert m.observed_value is True + + +@pytest.mark.unit +def test_observed_value_list() -> None: + """ObservedValue accepts list.""" + m = AggregateResultBase(observed_value=[1, 2, 3]) + assert m.observed_value == [1, 2, 3] + + +@pytest.mark.unit +def test_observed_value_dict() -> None: + """ObservedValue accepts dict.""" + m = AggregateResultBase(observed_value={"min": 0, "max": 10}) + assert m.observed_value == {"min": 0, "max": 10} + + +@pytest.mark.unit +def test_observed_value_none() -> None: + """ObservedValue defaults to None.""" + m = AggregateResultBase() + assert m.observed_value is None + + +@pytest.mark.unit +def test_observed_value_explicit_none() -> None: + """ObservedValue accepts explicit None.""" + m = AggregateResultBase(observed_value=None) + assert m.observed_value is None + + +# --------------------------------------------------------------------------- +# Details field +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_details_absent_defaults_to_none() -> None: + """details field defaults to None when not provided.""" + m = AggregateResultBase() + assert m.details is None + + 
+@pytest.mark.unit +def test_details_present_with_dict() -> None: + """details field accepts a dict.""" + m = AggregateResultBase(details={"percentile": 0.95, "min": 0, "max": 100}) + assert m.details == {"percentile": 0.95, "min": 0, "max": 100} + + +@pytest.mark.unit +def test_details_present_empty_dict() -> None: + """details field accepts an empty dict.""" + m = AggregateResultBase(details={}) + assert m.details == {} + + +@pytest.mark.unit +def test_details_explicit_none() -> None: + """details field accepts explicit None.""" + m = AggregateResultBase(details=None) + assert m.details is None + + +# --------------------------------------------------------------------------- +# AggregateResultBase extra=forbid +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_aggregate_result_base_extra_forbid() -> None: + """AggregateResultBase enforces extra=forbid.""" + with pytest.raises(pydantic.ValidationError): + AggregateResultBase(completely_unknown="value") + + +# --------------------------------------------------------------------------- +# AggregateBooleanOnlyResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_aggregate_boolean_only_empty() -> None: + """BOOLEAN_ONLY result is typically empty.""" + m = AggregateBooleanOnlyResult() + assert m.observed_value is None + assert m.details is None + + +@pytest.mark.unit +def test_aggregate_boolean_only_with_observed_value() -> None: + """AggregateBooleanOnlyResult inherits observed_value from base.""" + m = AggregateBooleanOnlyResult(observed_value=42) + assert m.observed_value == 42 + + +@pytest.mark.unit +def test_aggregate_boolean_only_with_details() -> None: + """AggregateBooleanOnlyResult inherits details from base.""" + m = AggregateBooleanOnlyResult(details={"info": "extra"}) + assert m.details == {"info": "extra"} + + +@pytest.mark.unit +def test_aggregate_boolean_only_extra_field_raises() 
-> None: + """extra=forbid: unknown fields raise ValidationError in AggregateBooleanOnlyResult.""" + with pytest.raises(pydantic.ValidationError): + AggregateBooleanOnlyResult(unknown_field="should_fail") + + +@pytest.mark.unit +def test_aggregate_boolean_only_basic_fields_rejected() -> None: + """AggregateBooleanOnlyResult does not accept AggregateBasicResult-only fields.""" + with pytest.raises(pydantic.ValidationError): + AggregateBooleanOnlyResult(element_count=100) + + +# --------------------------------------------------------------------------- +# AggregateBasicResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_aggregate_basic_parses_valid_result() -> None: + """AggregateBasicResult parses a typical BASIC result dict correctly.""" + m = AggregateBasicResult(**_BASIC_RESULT_DATA) + assert m.element_count == 200 + assert m.missing_count == 10 + assert m.missing_percent == 5.0 + assert m.partial_unexpected_list == ["a", "b"] + assert m.partial_missing_list == [None] + + +@pytest.mark.unit +def test_aggregate_basic_all_fields_none() -> None: + """All fields are Optional so AggregateBasicResult can be constructed with no args.""" + m = AggregateBasicResult() + assert m.element_count is None + assert m.missing_count is None + assert m.missing_percent is None + assert m.partial_unexpected_list is None + assert m.partial_missing_list is None + + +@pytest.mark.unit +def test_aggregate_basic_with_observed_value() -> None: + """AggregateBasicResult inherits observed_value from base.""" + m = AggregateBasicResult(observed_value=3.14, element_count=100) + assert m.observed_value == 3.14 + assert m.element_count == 100 + + +@pytest.mark.unit +def test_aggregate_basic_extra_field_raises() -> None: + """extra=forbid: unknown fields raise ValidationError in AggregateBasicResult.""" + with pytest.raises(pydantic.ValidationError): + AggregateBasicResult(**_BASIC_RESULT_DATA, unknown_field="bad") + + 
+@pytest.mark.unit +def test_aggregate_basic_complete_only_field_raises() -> None: + """AggregateBasicResult does not accept AggregateCompleteResult-only fields.""" + with pytest.raises(pydantic.ValidationError): + AggregateBasicResult(**_BASIC_RESULT_DATA, unexpected_list=[1, 2, 3]) + + +# --------------------------------------------------------------------------- +# AggregateSummaryResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_aggregate_summary_parses_valid_result() -> None: + """AggregateSummaryResult parses a typical SUMMARY result dict correctly.""" + m = AggregateSummaryResult(**_BASIC_RESULT_DATA) + assert m.element_count == 200 + assert m.missing_count == 10 + assert m.partial_unexpected_list == ["a", "b"] + + +@pytest.mark.unit +def test_aggregate_summary_all_optional() -> None: + """All fields in AggregateSummaryResult are Optional.""" + m = AggregateSummaryResult() + assert m.element_count is None + assert m.missing_count is None + assert m.partial_unexpected_list is None + + +@pytest.mark.unit +def test_aggregate_summary_extra_field_raises() -> None: + """extra=forbid: unknown fields raise ValidationError in AggregateSummaryResult.""" + with pytest.raises(pydantic.ValidationError): + AggregateSummaryResult(**_BASIC_RESULT_DATA, unknown_field="bad") + + +@pytest.mark.unit +def test_aggregate_summary_with_observed_value_and_details() -> None: + """AggregateSummaryResult inherits base fields.""" + m = AggregateSummaryResult( + observed_value={"mean": 42.0}, + details={"row_count": 1000}, + element_count=1000, + ) + assert m.observed_value == {"mean": 42.0} + assert m.details == {"row_count": 1000} + assert m.element_count == 1000 + + +@pytest.mark.unit +def test_aggregate_summary_complete_only_field_raises() -> None: + """AggregateSummaryResult does not accept AggregateCompleteResult-only fields.""" + with pytest.raises(pydantic.ValidationError): + 
AggregateSummaryResult(**_BASIC_RESULT_DATA, unexpected_list=[1, 2, 3]) + + +# --------------------------------------------------------------------------- +# AggregateCompleteResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_aggregate_complete_parses_valid_result() -> None: + """AggregateCompleteResult parses a typical COMPLETE result dict correctly.""" + data = {**_BASIC_RESULT_DATA, **_COMPLETE_EXTRA_DATA} + m = AggregateCompleteResult(**data) + assert m.element_count == 200 + assert m.missing_count == 10 + assert m.unexpected_list == ["a", "b", "c"] + assert m.unexpected_index_list == [0, 1, 2] + + +@pytest.mark.unit +def test_aggregate_complete_all_optional() -> None: + """All fields in AggregateCompleteResult are Optional.""" + m = AggregateCompleteResult() + assert m.unexpected_list is None + assert m.unexpected_index_list is None + assert m.element_count is None + + +@pytest.mark.unit +def test_aggregate_complete_extra_field_raises() -> None: + """extra=forbid: unknown fields raise ValidationError in AggregateCompleteResult.""" + data = {**_BASIC_RESULT_DATA, **_COMPLETE_EXTRA_DATA} + with pytest.raises(pydantic.ValidationError): + AggregateCompleteResult(**data, not_a_real_field="value") + + +@pytest.mark.unit +def test_aggregate_complete_inherits_all_ancestor_fields() -> None: + """AggregateCompleteResult inherits fields from all ancestor classes.""" + data = { + **_BASIC_RESULT_DATA, + **_COMPLETE_EXTRA_DATA, + "observed_value": 3.14, + "details": {"info": "complete"}, + } + m = AggregateCompleteResult(**data) + # From AggregateResultBase + assert m.observed_value == 3.14 + assert m.details == {"info": "complete"} + # From AggregateBasicResult + assert m.element_count == 200 + assert m.partial_unexpected_list == ["a", "b"] + assert m.partial_missing_list == [None] + # From AggregateCompleteResult + assert m.unexpected_list == ["a", "b", "c"] + assert m.unexpected_index_list == [0, 1, 2] 
+ + +@pytest.mark.unit +def test_aggregate_complete_with_list_observed_value() -> None: + """AggregateCompleteResult accepts list observed_value.""" + m = AggregateCompleteResult(observed_value=["a", "b", "c"]) + assert m.observed_value == ["a", "b", "c"] + + +# --------------------------------------------------------------------------- +# Inheritance chain sanity +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_inheritance_chain() -> None: + """Inheritance chain: Complete → Summary → Basic → ResultBase.""" + assert issubclass(AggregateCompleteResult, AggregateSummaryResult) + assert issubclass(AggregateSummaryResult, AggregateBasicResult) + assert issubclass(AggregateBasicResult, AggregateResultBase) + assert issubclass(AggregateBooleanOnlyResult, AggregateResultBase) + + +@pytest.mark.unit +def test_aggregate_complete_is_not_aggregate_boolean_only() -> None: + """AggregateCompleteResult and AggregateBooleanOnlyResult are separate leaf classes.""" + assert not issubclass(AggregateCompleteResult, AggregateBooleanOnlyResult) + assert not issubclass(AggregateBooleanOnlyResult, AggregateCompleteResult) + + +# --------------------------------------------------------------------------- +# AggregateSummaryResult field parity with AggregateBasicResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_aggregate_summary_is_subclass_of_basic() -> None: + """AggregateSummaryResult is a subclass of AggregateBasicResult (no new fields).""" + assert issubclass(AggregateSummaryResult, AggregateBasicResult) + # verify they have the same fields (summary adds no new fields) + assert set(AggregateSummaryResult.__fields__.keys()) == set( + AggregateBasicResult.__fields__.keys() + ) diff --git a/tests/unit/core/validation_result_schemas/test_schemas_map.py b/tests/unit/core/validation_result_schemas/test_schemas_map.py new file mode 100644 index 000000000000..ac0b666598a2 --- /dev/null 
+++ b/tests/unit/core/validation_result_schemas/test_schemas_map.py @@ -0,0 +1,394 @@ +"""Unit tests for the MapResult schema family. + +Covers: +- Each format variant (MapBooleanOnlyResult, MapBasicResult, MapSummaryResult, + MapCompleteResult) parses a valid result dict correctly. +- All expected fields match the input. +- Unknown extra fields raise pydantic.ValidationError (extra=forbid). +- Validator functions (validate_unexpected_rows_passthrough, + validate_partial_unexpected_counts_fallback) work as expected. +- root_validate_engine_required_fields stays a no-op for engine_hint='sql' alone, a non-SQL + hint, or no hint (the raising path is not exercised by these tests). + +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_schemas_map.py -m unit +""" + +from __future__ import annotations + +import pytest + +from great_expectations.compatibility import pydantic +from great_expectations.core.validation_result_schemas.schemas.map_result import ( + MapBasicResult, + MapBooleanOnlyResult, + MapCompleteResult, + MapResultBase, + MapSummaryResult, +) + +# --------------------------------------------------------------------------- +# Shared fixture data +# --------------------------------------------------------------------------- + +_BASIC_RESULT_DATA = { + "element_count": 100, + "unexpected_count": 5, + "unexpected_percent": 5.0, + "missing_count": 2, + "missing_percent": 2.0, + "unexpected_percent_total": 5.0, + "unexpected_percent_nonmissing": 5.0, + "partial_unexpected_list": [1, 2, 3], + "unexpected_rows": None, +} + +_SUMMARY_EXTRA_DATA = { + "partial_unexpected_counts": [{"value": 1, "count": 3}], + "partial_unexpected_index_list": [], +} + +_COMPLETE_EXTRA_DATA = { + "unexpected_list": [1, 2, 3], + "unexpected_index_list": [10, 11, 12], +} + + +# --------------------------------------------------------------------------- +# MapBooleanOnlyResult +# 
--------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_map_boolean_only_empty_dict() -> None: + """BOOLEAN_ONLY result dict is typically empty.""" + m = MapBooleanOnlyResult() + assert m.unexpected_index_query is None + assert m.unexpected_index_column_names is None + assert m.engine_hint is None + + +@pytest.mark.unit +def test_map_boolean_only_with_sql_fields() -> None: + """SQL engine can set unexpected_index_query and unexpected_index_column_names.""" + m = MapBooleanOnlyResult( + unexpected_index_query="SELECT * FROM foo WHERE ...", + unexpected_index_column_names=["id"], + ) + assert m.unexpected_index_query == "SELECT * FROM foo WHERE ..." + assert m.unexpected_index_column_names == ["id"] + + +@pytest.mark.unit +def test_map_boolean_only_extra_field_raises() -> None: + """extra=forbid: unknown fields raise ValidationError.""" + with pytest.raises(pydantic.ValidationError): + MapBooleanOnlyResult(unknown_field="should_fail") + + +@pytest.mark.unit +def test_map_boolean_only_basic_result_fields_are_rejected() -> None: + """MapBooleanOnlyResult does not accept MapBasicResult-only fields.""" + with pytest.raises(pydantic.ValidationError): + MapBooleanOnlyResult(element_count=100) + + +# --------------------------------------------------------------------------- +# MapBasicResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_map_basic_parses_valid_result() -> None: + """MapBasicResult parses a typical pandas BASIC result dict correctly.""" + m = MapBasicResult(**_BASIC_RESULT_DATA) + assert m.element_count == 100 + assert m.unexpected_count == 5 + assert m.unexpected_percent == 5.0 + assert m.missing_count == 2 + assert m.missing_percent == 2.0 + assert m.unexpected_percent_total == 5.0 + assert m.unexpected_percent_nonmissing == 5.0 + assert m.partial_unexpected_list == [1, 2, 3] + assert m.unexpected_rows is None + + +@pytest.mark.unit +def 
test_map_basic_all_fields_none() -> None: + """All fields are Optional so MapBasicResult can be constructed with no args.""" + m = MapBasicResult() + assert m.element_count is None + assert m.unexpected_count is None + assert m.partial_unexpected_list is None + assert m.unexpected_rows is None + + +@pytest.mark.unit +def test_map_basic_extra_field_raises() -> None: + """extra=forbid: unknown fields raise ValidationError in MapBasicResult.""" + with pytest.raises(pydantic.ValidationError): + MapBasicResult(**_BASIC_RESULT_DATA, unknown_field="bad") + + +@pytest.mark.unit +def test_map_basic_unexpected_rows_accepts_none() -> None: + m = MapBasicResult(unexpected_rows=None) + assert m.unexpected_rows is None + + +@pytest.mark.unit +def test_map_basic_unexpected_rows_accepts_list() -> None: + rows = [{"col_a": 1, "col_b": "x"}, {"col_a": 2, "col_b": "y"}] + m = MapBasicResult(unexpected_rows=rows) + assert m.unexpected_rows == rows + + +@pytest.mark.unit +def test_map_basic_unexpected_rows_accepts_string() -> None: + """unexpected_rows: Any accepts string (e.g., a serialized representation).""" + m = MapBasicResult(unexpected_rows="some-string-representation") + assert m.unexpected_rows == "some-string-representation" + + +@pytest.mark.unit +def test_map_basic_inherits_sql_fields() -> None: + """MapBasicResult inherits SQL-only fields from MapResultBase.""" + m = MapBasicResult( + **_BASIC_RESULT_DATA, + unexpected_index_query="SELECT ...", + unexpected_index_column_names=["pk"], + ) + assert m.unexpected_index_query == "SELECT ..." 
+ assert m.unexpected_index_column_names == ["pk"] + + +@pytest.mark.unit +def test_map_basic_summary_only_field_raises() -> None: + """MapBasicResult does not accept MapSummaryResult-only fields.""" + with pytest.raises(pydantic.ValidationError): + MapBasicResult(**_BASIC_RESULT_DATA, partial_unexpected_counts=[]) + + +# --------------------------------------------------------------------------- +# MapSummaryResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_map_summary_parses_valid_result() -> None: + """MapSummaryResult parses a typical SUMMARY result dict correctly.""" + data = {**_BASIC_RESULT_DATA, **_SUMMARY_EXTRA_DATA} + m = MapSummaryResult(**data) + assert m.element_count == 100 + assert m.partial_unexpected_counts == [{"value": 1, "count": 3}] + assert m.partial_unexpected_index_list == [] + + +@pytest.mark.unit +def test_map_summary_all_optional() -> None: + """All fields in MapSummaryResult are Optional.""" + m = MapSummaryResult() + assert m.partial_unexpected_counts is None + assert m.partial_unexpected_index_list is None + + +@pytest.mark.unit +def test_map_summary_extra_field_raises() -> None: + """extra=forbid: unknown fields raise ValidationError in MapSummaryResult.""" + data = {**_BASIC_RESULT_DATA, **_SUMMARY_EXTRA_DATA} + with pytest.raises(pydantic.ValidationError): + MapSummaryResult(**data, unknown_field="bad") + + +@pytest.mark.unit +def test_map_summary_partial_counts_accepts_canonical_shape() -> None: + """partial_unexpected_counts: [{value: x, count: n}, ...] 
is canonical.""" + counts = [{"value": "foo", "count": 3}, {"value": "bar", "count": 1}] + m = MapSummaryResult(partial_unexpected_counts=counts) + assert m.partial_unexpected_counts == counts + + +@pytest.mark.unit +def test_map_summary_partial_counts_accepts_error_fallback() -> None: + """partial_unexpected_counts: [{"error": "..."}] fallback shape is accepted.""" + fallback = [{"error": "partial_exception_counts requires a hashable type"}] + m = MapSummaryResult(partial_unexpected_counts=fallback) + assert m.partial_unexpected_counts == fallback + + +@pytest.mark.unit +def test_map_summary_partial_counts_accepts_none() -> None: + m = MapSummaryResult(partial_unexpected_counts=None) + assert m.partial_unexpected_counts is None + + +@pytest.mark.unit +def test_map_summary_complete_only_field_raises() -> None: + """MapSummaryResult does not accept MapCompleteResult-only fields.""" + with pytest.raises(pydantic.ValidationError): + MapSummaryResult(**_BASIC_RESULT_DATA, unexpected_list=[1, 2, 3]) + + +# --------------------------------------------------------------------------- +# MapCompleteResult +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_map_complete_parses_valid_result() -> None: + """MapCompleteResult parses a typical COMPLETE result dict correctly.""" + data = {**_BASIC_RESULT_DATA, **_SUMMARY_EXTRA_DATA, **_COMPLETE_EXTRA_DATA} + m = MapCompleteResult(**data) + assert m.element_count == 100 + assert m.partial_unexpected_counts == [{"value": 1, "count": 3}] + assert m.unexpected_list == [1, 2, 3] + assert m.unexpected_index_list == [10, 11, 12] + + +@pytest.mark.unit +def test_map_complete_all_optional() -> None: + """All fields in MapCompleteResult are Optional.""" + m = MapCompleteResult() + assert m.unexpected_list is None + assert m.unexpected_index_list is None + + +@pytest.mark.unit +def test_map_complete_extra_field_raises() -> None: + """extra=forbid: unknown fields raise 
ValidationError in MapCompleteResult.""" + data = {**_BASIC_RESULT_DATA, **_SUMMARY_EXTRA_DATA, **_COMPLETE_EXTRA_DATA} + with pytest.raises(pydantic.ValidationError): + MapCompleteResult(**data, not_a_real_field="value") + + +@pytest.mark.unit +def test_map_complete_inherits_all_ancestor_fields() -> None: + """MapCompleteResult inherits fields from all ancestor classes.""" + data = { + **_BASIC_RESULT_DATA, + **_SUMMARY_EXTRA_DATA, + **_COMPLETE_EXTRA_DATA, + "unexpected_index_query": "SELECT ...", + "unexpected_index_column_names": ["id"], + } + m = MapCompleteResult(**data) + # From MapResultBase + assert m.unexpected_index_query == "SELECT ..." + assert m.unexpected_index_column_names == ["id"] + # From MapBasicResult + assert m.element_count == 100 + assert m.partial_unexpected_list == [1, 2, 3] + # From MapSummaryResult + assert m.partial_unexpected_counts == [{"value": 1, "count": 3}] + # From MapCompleteResult + assert m.unexpected_list == [1, 2, 3] + assert m.unexpected_index_list == [10, 11, 12] + + +# --------------------------------------------------------------------------- +# root_validate_engine_required_fields (via MapCompleteResult) +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_map_complete_sql_hint_with_query_passes() -> None: + """SQL engine hint + unexpected_index_query present → passes.""" + m = MapCompleteResult( + engine_hint="sql", + unexpected_index_query="SELECT id FROM table WHERE val > 5", + ) + assert m.unexpected_index_query == "SELECT id FROM table WHERE val > 5" + + +@pytest.mark.unit +def test_map_complete_sql_hint_missing_query_no_raise_without_return_flag() -> None: + """SQL engine + no return_unexpected_index_query flag → validator is a no-op. + + The root_validate_engine_required_fields only raises when BOTH engine_hint='sql' + AND return_unexpected_index_query=True are in the values dict. 
Since + MapCompleteResult does not declare return_unexpected_index_query as a field, + setting engine_hint='sql' alone does NOT trigger the SQL assertion. + The validator is designed to be composed with the dispatcher, which can + inject additional context via a helper field if needed. + """ + # Should NOT raise: engine_hint='sql' but no return_unexpected_index_query field + m = MapCompleteResult(engine_hint="sql", unexpected_index_query=None) + assert m.engine_hint == "sql" + assert m.unexpected_index_query is None + + +@pytest.mark.unit +def test_map_complete_no_engine_hint_no_query_passes() -> None: + """No engine hint → root validator is a no-op regardless of other fields.""" + m = MapCompleteResult( + unexpected_index_query=None, + ) + assert m.unexpected_index_query is None + + +@pytest.mark.unit +def test_map_complete_pandas_engine_no_query_passes() -> None: + """Non-SQL engine hint → root validator is a no-op.""" + m = MapCompleteResult( + engine_hint="pandas", + unexpected_index_query=None, + ) + assert m.engine_hint == "pandas" + assert m.unexpected_index_query is None + + +# --------------------------------------------------------------------------- +# engine_hint field inheritance +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_engine_hint_present_in_all_variants() -> None: + """engine_hint is declared on MapResultBase and inherited by all variants.""" + assert "engine_hint" in MapResultBase.__fields__ + assert "engine_hint" in MapBooleanOnlyResult.__fields__ + assert "engine_hint" in MapBasicResult.__fields__ + assert "engine_hint" in MapSummaryResult.__fields__ + assert "engine_hint" in MapCompleteResult.__fields__ + + +@pytest.mark.unit +def test_engine_hint_defaults_to_none() -> None: + """engine_hint defaults to None on all variants.""" + assert MapBooleanOnlyResult().engine_hint is None + assert MapBasicResult().engine_hint is None + assert MapSummaryResult().engine_hint is None + 
assert MapCompleteResult().engine_hint is None + + +# --------------------------------------------------------------------------- +# extra=forbid on MapResultBase +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_map_result_base_extra_forbid() -> None: + """MapResultBase itself also enforces extra=forbid.""" + with pytest.raises(pydantic.ValidationError): + MapResultBase(completely_unknown="value") + + +# --------------------------------------------------------------------------- +# Inheritance chain sanity +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_inheritance_chain() -> None: + """MapCompleteResult → MapSummaryResult → MapBasicResult → MapResultBase.""" + assert issubclass(MapCompleteResult, MapSummaryResult) + assert issubclass(MapSummaryResult, MapBasicResult) + assert issubclass(MapBasicResult, MapResultBase) + assert issubclass(MapBooleanOnlyResult, MapResultBase) + + +@pytest.mark.unit +def test_map_complete_is_not_map_boolean_only() -> None: + """MapCompleteResult and MapBooleanOnlyResult are separate leaf classes.""" + assert not issubclass(MapCompleteResult, MapBooleanOnlyResult) + assert not issubclass(MapBooleanOnlyResult, MapCompleteResult) diff --git a/tests/unit/core/validation_result_schemas/test_schemas_overrides.py b/tests/unit/core/validation_result_schemas/test_schemas_overrides.py new file mode 100644 index 000000000000..6db8a7391eb4 --- /dev/null +++ b/tests/unit/core/validation_result_schemas/test_schemas_overrides.py @@ -0,0 +1,86 @@ +"""Unit tests for per-expectation schema overrides. + +Covers: +- expect_column_values_to_be_of_type pandas-path payload matches MapBasicResult, + NOT ExpectColumnValuesToBeOfTypeSqlSparkResult. +- expect_column_values_to_be_of_type SQL/Spark-path payload matches the override. +- Extra fields on the override raise pydantic.ValidationError (extra=forbid). 
+ +All tests are marked @pytest.mark.unit and run via: + pytest tests/unit/core/validation_result_schemas/test_schemas_overrides.py -m unit +""" + +from __future__ import annotations + +import pytest + +from great_expectations.compatibility import pydantic +from great_expectations.core.validation_result_schemas.schemas.map_result import ( + MapBasicResult, +) +from great_expectations.core.validation_result_schemas.schemas.per_expectation_overrides import ( + ExpectColumnValuesToBeOfTypeSqlSparkResult, +) + +# --------------------------------------------------------------------------- +# Pandas-path: expect_column_values_to_be_of_type emits a map-shaped result +# --------------------------------------------------------------------------- + +_PANDAS_RESULT = { + "element_count": 10, + "unexpected_count": 0, + "unexpected_percent": 0.0, + "partial_unexpected_list": [], +} + + +@pytest.mark.unit +def test_pandas_path_parses_as_map_basic_result() -> None: + """The pandas result for expect_column_values_to_be_of_type is map-shaped. + + It must parse as MapBasicResult, confirming it belongs to the Map family. + """ + m = MapBasicResult(**_PANDAS_RESULT) + assert m.element_count == 10 + assert m.unexpected_count == 0 + assert m.unexpected_percent == 0.0 + assert m.partial_unexpected_list == [] + + +# --------------------------------------------------------------------------- +# SQL/Spark-path: expect_column_values_to_be_of_type emits {observed_value: ...} +# --------------------------------------------------------------------------- + +_SQL_SPARK_RESULT = {"observed_value": "str"} + + +@pytest.mark.unit +def test_sql_spark_path_parses_as_override() -> None: + """The SQL/Spark result for expect_column_values_to_be_of_type matches the override. + + SQL/Spark bypasses _format_map_output and emits only {observed_value: ...}. 
+ """ + r = ExpectColumnValuesToBeOfTypeSqlSparkResult(**_SQL_SPARK_RESULT) + assert r.observed_value == "str" + + +@pytest.mark.unit +def test_sql_spark_path_observed_value_preserved() -> None: + """observed_value carries the type name string verbatim.""" + r = ExpectColumnValuesToBeOfTypeSqlSparkResult(observed_value="INTEGER") + assert r.observed_value == "INTEGER" + + +# --------------------------------------------------------------------------- +# extra=forbid: unknown fields on the override must raise +# --------------------------------------------------------------------------- + + +@pytest.mark.unit +def test_override_extra_field_raises() -> None: + """ExpectColumnValuesToBeOfTypeSqlSparkResult rejects unknown extra fields.""" + with pytest.raises(pydantic.ValidationError): + ExpectColumnValuesToBeOfTypeSqlSparkResult( + observed_value="int", + unexpected_extra="x", + )