Skip to content

Commit c8c8d82

Browse files
[BUGFIX] ensure unexpected_rows are included if requested (#11358)
1 parent b5950fa commit c8c8d82

8 files changed

+205
-5
lines changed

great_expectations/expectations/core/expect_column_values_to_be_null.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -367,8 +367,16 @@ def _validate(
367367

368368
nonnull_count = None
369369

370+
# Handle unexpected_rows for include_unexpected_rows feature
371+
parsed_result_format = parse_result_format(result_format)
372+
unexpected_rows = None
373+
if parsed_result_format.get("include_unexpected_rows", False):
374+
unexpected_rows = metrics.get(
375+
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_ROWS.value}"
376+
)
377+
370378
return _format_map_output(
371-
result_format=parse_result_format(result_format),
379+
result_format=parsed_result_format,
372380
success=success,
373381
element_count=metrics.get("table.row_count"),
374382
nonnull_count=nonnull_count,
@@ -384,4 +392,5 @@ def _validate(
384392
unexpected_index_query=metrics.get(
385393
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_INDEX_QUERY.value}"
386394
),
395+
unexpected_rows=unexpected_rows,
387396
)

great_expectations/expectations/core/expect_column_values_to_not_be_null.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -406,8 +406,16 @@ def _validate(
406406

407407
nonnull_count = None
408408

409+
# Handle unexpected_rows for include_unexpected_rows feature
410+
parsed_result_format = parse_result_format(result_format)
411+
unexpected_rows = None
412+
if parsed_result_format.get("include_unexpected_rows", False):
413+
unexpected_rows = metrics.get(
414+
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_ROWS.value}"
415+
)
416+
409417
return _format_map_output(
410-
result_format=parse_result_format(result_format),
418+
result_format=parsed_result_format,
411419
success=success,
412420
element_count=metrics.get("table.row_count"),
413421
nonnull_count=nonnull_count,
@@ -423,4 +431,5 @@ def _validate(
423431
unexpected_index_query=metrics.get(
424432
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_INDEX_QUERY.value}"
425433
),
434+
unexpected_rows=unexpected_rows,
426435
)

great_expectations/expectations/expectation.py

Lines changed: 27 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2335,9 +2335,14 @@ def _validate(
23352335
self._get_result_format(runtime_configuration=runtime_configuration)
23362336
)
23372337

2338+
include_unexpected_rows: bool
23382339
unexpected_index_column_names = None
23392340
if isinstance(result_format, dict):
2341+
include_unexpected_rows = bool(result_format.get("include_unexpected_rows", False))
23402342
unexpected_index_column_names = result_format.get("unexpected_index_column_names", None)
2343+
else:
2344+
include_unexpected_rows = False
2345+
23412346
total_count: Optional[int] = metrics.get("table.row_count")
23422347
unexpected_count: Optional[int] = metrics.get(
23432348
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_COUNT.value}"
@@ -2355,6 +2360,12 @@ def _validate(
23552360
f"{self.map_metric}.{SummarizationMetricNameSuffixes.FILTERED_ROW_COUNT.value}"
23562361
)
23572362

2363+
unexpected_rows = None
2364+
if include_unexpected_rows:
2365+
unexpected_rows = metrics.get(
2366+
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_ROWS.value}"
2367+
)
2368+
23582369
if (
23592370
total_count is None
23602371
or unexpected_count is None
@@ -2381,6 +2392,7 @@ def _validate(
23812392
unexpected_index_list=unexpected_index_list,
23822393
unexpected_index_query=unexpected_index_query,
23832394
unexpected_index_column_names=unexpected_index_column_names,
2395+
unexpected_rows=unexpected_rows,
23842396
)
23852397

23862398

@@ -2540,9 +2552,6 @@ def get_validation_dependencies(
25402552
),
25412553
)
25422554

2543-
if result_format_str == ResultFormat.BASIC:
2544-
return validation_dependencies
2545-
25462555
if include_unexpected_rows:
25472556
metric_kwargs = get_metric_kwargs(
25482557
metric_name=f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_ROWS.value}",
@@ -2558,6 +2567,9 @@ def get_validation_dependencies(
25582567
),
25592568
)
25602569

2570+
if result_format_str == ResultFormat.BASIC:
2571+
return validation_dependencies
2572+
25612573
from great_expectations.execution_engine import (
25622574
SqlAlchemyExecutionEngine,
25632575
)
@@ -2605,9 +2617,14 @@ def _validate(
26052617
execution_engine: Optional[ExecutionEngine] = None,
26062618
):
26072619
result_format = self._get_result_format(runtime_configuration=runtime_configuration)
2620+
2621+
include_unexpected_rows: bool
26082622
unexpected_index_column_names = None
26092623
if isinstance(result_format, dict):
2624+
include_unexpected_rows = bool(result_format.get("include_unexpected_rows", False))
26102625
unexpected_index_column_names = result_format.get("unexpected_index_column_names", None)
2626+
else:
2627+
include_unexpected_rows = False
26112628

26122629
total_count: Optional[int] = metrics.get("table.row_count")
26132630
unexpected_count: Optional[int] = metrics.get(
@@ -2626,6 +2643,12 @@ def _validate(
26262643
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_INDEX_QUERY.value}"
26272644
)
26282645

2646+
unexpected_rows = None
2647+
if include_unexpected_rows:
2648+
unexpected_rows = metrics.get(
2649+
f"{self.map_metric}.{SummarizationMetricNameSuffixes.UNEXPECTED_ROWS.value}"
2650+
)
2651+
26292652
if (
26302653
total_count is None
26312654
or unexpected_count is None
@@ -2652,6 +2675,7 @@ def _validate(
26522675
unexpected_index_list=unexpected_index_list,
26532676
unexpected_index_query=unexpected_index_query,
26542677
unexpected_index_column_names=unexpected_index_column_names,
2678+
unexpected_rows=unexpected_rows,
26552679
)
26562680

26572681

tests/integration/data_sources_and_expectations/expectations/test_expect_column_pair_values_to_be_equal.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
from datetime import datetime
2+
from typing import Any, Dict, cast
23
from unittest.mock import ANY
34

45
import pandas as pd
@@ -220,3 +221,32 @@ def test_success_with_suite_param_ignore_row_if_(
220221
expectation, expectation_parameters={suite_param_key: suite_param_value}
221222
)
222223
assert result.success == expected_result
224+
225+
226+
@parameterize_batch_for_data_sources(data_source_configs=JUST_PANDAS_DATA_SOURCES, data=DATA)
227+
def test_include_unexpected_rows(batch_for_datasource: Batch) -> None:
228+
"""Test that include_unexpected_rows works correctly for ExpectColumnPairValuesToBeEqual."""
229+
expectation = gxe.ExpectColumnPairValuesToBeEqual(
230+
column_A=EQUAL_STRINGS_A, column_B=UNEQUAL_STRINGS
231+
)
232+
result = batch_for_datasource.validate(
233+
expectation, result_format={"result_format": "BASIC", "include_unexpected_rows": True}
234+
)
235+
236+
assert not result.success
237+
result_dict = cast("Dict[str, Any]", result.to_json_dict()["result"])
238+
239+
# Verify that unexpected_rows is present and contains the expected data
240+
assert "unexpected_rows" in result_dict
241+
assert result_dict["unexpected_rows"] is not None
242+
243+
# Convert to DataFrame for easier comparison
244+
unexpected_rows_data = result_dict["unexpected_rows"]
245+
assert isinstance(unexpected_rows_data, list)
246+
unexpected_rows_df = pd.DataFrame(unexpected_rows_data)
247+
248+
# Should contain 1 row where column_A != column_B
249+
assert len(unexpected_rows_df) == 1
250+
251+
# The unexpected row should have different values in column_A and column_B
252+
assert unexpected_rows_df.loc[0, EQUAL_STRINGS_A] != unexpected_rows_df.loc[0, UNEQUAL_STRINGS]

tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_be_null.py

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from typing import Any, Dict, cast
12
from unittest.mock import ANY
23

34
import pandas as pd
@@ -112,3 +113,34 @@ def test_failure(
112113
) -> None:
113114
result = batch_for_datasource.validate(expectation)
114115
assert not result.success
116+
117+
118+
@parameterize_batch_for_data_sources(data_source_configs=JUST_PANDAS_DATA_SOURCES, data=DATA)
119+
def test_include_unexpected_rows_pandas(batch_for_datasource: Batch) -> None:
120+
"""Test that include_unexpected_rows works correctly for ExpectColumnValuesToBeNull."""
121+
expectation = gxe.ExpectColumnValuesToBeNull(column=MOSTLY_NULL_COLUMN)
122+
result = batch_for_datasource.validate(
123+
expectation, result_format={"result_format": "BASIC", "include_unexpected_rows": True}
124+
)
125+
126+
assert not result.success
127+
result_dict = cast("Dict[str, Any]", result.to_json_dict()["result"])
128+
129+
# Verify that unexpected_rows is present and contains the expected data
130+
assert "unexpected_rows" in result_dict
131+
assert result_dict["unexpected_rows"] is not None
132+
133+
# Convert to DataFrame for easier comparison
134+
unexpected_rows_data = result_dict["unexpected_rows"]
135+
assert isinstance(unexpected_rows_data, list)
136+
unexpected_rows_df = pd.DataFrame(unexpected_rows_data)
137+
138+
# Should contain 1 row where MOSTLY_NULL_COLUMN is not null (index 0 with value 1)
139+
assert len(unexpected_rows_df) == 1
140+
assert list(unexpected_rows_df.index) == [0]
141+
142+
# The unexpected row should have value 1 in MOSTLY_NULL_COLUMN
143+
assert unexpected_rows_df.loc[0, MOSTLY_NULL_COLUMN] == 1
144+
145+
# ALL_NULL_COLUMN should be null in the unexpected row
146+
assert pd.isna(unexpected_rows_df.loc[0, ALL_NULL_COLUMN])

tests/integration/data_sources_and_expectations/expectations/test_expect_column_values_to_not_be_null.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
from typing import Any, Dict, cast
12
from unittest.mock import ANY
23

34
import pandas as pd
@@ -174,3 +175,35 @@ def test_failure(
174175
) -> None:
175176
result = batch_for_datasource.validate(expectation)
176177
assert not result.success
178+
179+
180+
@parameterize_batch_for_data_sources(data_source_configs=JUST_PANDAS_DATA_SOURCES, data=DATA)
181+
def test_include_unexpected_rows_pandas(batch_for_datasource: Batch) -> None:
182+
"""Test that include_unexpected_rows works correctly for ExpectColumnValuesToNotBeNull."""
183+
expectation = gxe.ExpectColumnValuesToNotBeNull(column=MOSTLY_NULL_COLUMN)
184+
result = batch_for_datasource.validate(
185+
expectation, result_format={"result_format": "BASIC", "include_unexpected_rows": True}
186+
)
187+
188+
assert not result.success
189+
result_dict = cast("Dict[str, Any]", result.to_json_dict()["result"])
190+
191+
# Verify that unexpected_rows is present and contains the expected data
192+
assert "unexpected_rows" in result_dict
193+
assert result_dict["unexpected_rows"] is not None
194+
195+
# Convert to DataFrame for easier comparison
196+
unexpected_rows_data = result_dict["unexpected_rows"]
197+
assert isinstance(unexpected_rows_data, list)
198+
unexpected_rows_df = pd.DataFrame(unexpected_rows_data)
199+
200+
# Should contain 4 rows where MOSTLY_NULL_COLUMN is null
201+
assert len(unexpected_rows_df) == 4
202+
203+
# All values in the MOSTLY_NULL_COLUMN should be null in the unexpected rows
204+
assert unexpected_rows_df[MOSTLY_NULL_COLUMN].isnull().all()
205+
206+
# Other columns should have their original values (rows with indices 1,2,3,4)
207+
# In the unexpected_rows result, these get re-indexed starting from 0
208+
assert list(unexpected_rows_df[NON_NULL_COLUMN]) == [2, 3, 4, 5]
209+
assert list(unexpected_rows_df[ALL_NULL_COLUMN].isnull()) == [True, True, True, True]

tests/integration/data_sources_and_expectations/expectations/test_expect_multicolumn_sum_to_equal.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
from typing import Any, Dict, cast
2+
13
import pandas as pd
24
import pytest
35

@@ -114,3 +116,31 @@ def test_success_with_suite_param_ignore_row_if_(
114116
expectation, expectation_parameters={suite_param_key: suite_param_value}
115117
)
116118
assert result.success == expected_result
119+
120+
121+
@parameterize_batch_for_data_sources(data_source_configs=JUST_PANDAS_DATA_SOURCES, data=DATA)
122+
def test_include_unexpected_rows(batch_for_datasource: Batch) -> None:
123+
"""Test that include_unexpected_rows works correctly for ExpectMulticolumnSumToEqual."""
124+
expectation = gxe.ExpectMulticolumnSumToEqual(column_list=[COL_A, COL_B], sum_total=10)
125+
result = batch_for_datasource.validate(
126+
expectation, result_format={"result_format": "BASIC", "include_unexpected_rows": True}
127+
)
128+
129+
assert not result.success
130+
result_dict = cast("Dict[str, Any]", result.to_json_dict()["result"])
131+
132+
# Verify that unexpected_rows is present and contains the expected data
133+
assert "unexpected_rows" in result_dict
134+
assert result_dict["unexpected_rows"] is not None
135+
136+
# Convert to DataFrame for easier comparison
137+
unexpected_rows_data = result_dict["unexpected_rows"]
138+
assert isinstance(unexpected_rows_data, list)
139+
unexpected_rows_df = pd.DataFrame(unexpected_rows_data)
140+
141+
# Should contain multiple rows where sum != 10
142+
assert len(unexpected_rows_df) > 0
143+
144+
# Check that the rows contain the expected columns
145+
assert COL_A in unexpected_rows_df.columns
146+
assert COL_B in unexpected_rows_df.columns

tests/integration/data_sources_and_expectations/expectations/test_expect_select_column_values_to_be_unique_within_record.py

Lines changed: 33 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
from typing import Any, Dict, cast
2+
13
import pandas as pd
24
import pytest
35

@@ -112,3 +114,34 @@ def test_success_with_suite_param_ignore_row_if_(
112114
expectation, expectation_parameters={suite_param_key: suite_param_value}
113115
)
114116
assert result.success == expected_result
117+
118+
119+
@parameterize_batch_for_data_sources(data_source_configs=JUST_PANDAS_DATA_SOURCES, data=DATA)
120+
def test_include_unexpected_rows(batch_for_datasource: Batch) -> None:
121+
"""Test include_unexpected_rows for ExpectSelectColumnValuesToBeUniqueWithinRecord."""
122+
expectation = gxe.ExpectSelectColumnValuesToBeUniqueWithinRecord(
123+
column_list=[INT_COL_A, INT_COL_B, INT_COL_C]
124+
)
125+
result = batch_for_datasource.validate(
126+
expectation, result_format={"result_format": "BASIC", "include_unexpected_rows": True}
127+
)
128+
129+
assert not result.success
130+
result_dict = cast("Dict[str, Any]", result.to_json_dict()["result"])
131+
132+
# Verify that unexpected_rows is present and contains the expected data
133+
assert "unexpected_rows" in result_dict
134+
assert result_dict["unexpected_rows"] is not None
135+
136+
# Convert to DataFrame for easier comparison
137+
unexpected_rows_data = result_dict["unexpected_rows"]
138+
assert isinstance(unexpected_rows_data, list)
139+
unexpected_rows_df = pd.DataFrame(unexpected_rows_data)
140+
141+
# Should contain rows that have non-unique values in the selected columns
142+
assert len(unexpected_rows_df) > 0
143+
144+
# Check that the rows contain the expected columns
145+
assert INT_COL_A in unexpected_rows_df.columns
146+
assert INT_COL_B in unexpected_rows_df.columns
147+
assert INT_COL_C in unexpected_rows_df.columns

0 commit comments

Comments
 (0)