
[MAINTENANCE] Databricks data type expectation tests #10734


Open
wants to merge 5 commits into develop
@@ -0,0 +1,164 @@
from datetime import datetime, timezone

import pandas as pd
import pytest

from great_expectations.compatibility.sqlalchemy import sqltypes
from great_expectations.expectations import (
    ExpectColumnDistinctValuesToContainSet,
    ExpectColumnSumToBeBetween,
    ExpectColumnValuesToBeBetween,
    ExpectColumnValuesToBeInSet,
    ExpectColumnValuesToBeOfType,
)
from tests.integration.test_utils.data_source_config import DatabricksDatasourceTestConfig
from tests.integration.test_utils.data_source_config.databricks import DatabricksBatchTestSetup

pytestmark = pytest.mark.databricks
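# A module-level `pytestmark` applies the `databricks` marker to every test
# collected from this module, so the whole suite can be selected with
# `pytest -m databricks`.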


class TestDatabricksDataTypes:
    """This set of tests ensures that we can run expectations against every data
    type supported by Databricks.

    https://docs.databricks.com/en/sql/language-manual/sql-ref-datatypes.html
    """

    BOOL_COL_NAME = "my_bool"
    DATE_COL_NAME = "my_date"
    NUMERIC_COL_NAME = "my_number"
    STRING_COL_NAME = "my_string"

    DATA_FRAME = pd.DataFrame(
        {
            BOOL_COL_NAME: [True, False, True, False],
            DATE_COL_NAME: [
                datetime(2021, 1, 1, tzinfo=timezone.utc).date(),
                datetime(2021, 1, 2, tzinfo=timezone.utc).date(),
                datetime(2021, 1, 3, tzinfo=timezone.utc).date(),
                datetime(2021, 1, 4, tzinfo=timezone.utc).date(),
            ],
            NUMERIC_COL_NAME: [1, 2, 3, 4],
            STRING_COL_NAME: ["a", "b", "c", "d"],
        }
    )

    @pytest.mark.parametrize(
        "column_type",
        [
            sqltypes.SMALLINT,
            sqltypes.INT,
            sqltypes.BIGINT,
            sqltypes.DECIMAL,
            sqltypes.FLOAT,
            sqltypes.REAL,
        ],
    )
    def test_number(self, column_type: sqltypes.TypeEngine):
        batch_setup = DatabricksBatchTestSetup(
            config=DatabricksDatasourceTestConfig(
                column_types={self.NUMERIC_COL_NAME: column_type}
            ),
            data=self.DATA_FRAME,
            extra_data={},
        )
        with batch_setup.batch_test_context() as batch:
            result = batch.validate(
                expect=ExpectColumnSumToBeBetween(
                    column=self.NUMERIC_COL_NAME,
                    min_value=9,
                    max_value=11,
                )
            )
        assert result.success

    def test_varchar(self):
        column_type = sqltypes.String
        batch_setup = DatabricksBatchTestSetup(
            config=DatabricksDatasourceTestConfig(column_types={self.STRING_COL_NAME: column_type}),
            data=self.DATA_FRAME,
            extra_data={},
        )
        with batch_setup.batch_test_context() as batch:
            result = batch.validate(
                expect=ExpectColumnDistinctValuesToContainSet(
                    column=self.STRING_COL_NAME,
                    value_set=[
                        "a",
                        "b",
                    ],
                )
            )
        assert result.success

    def test_boolean(self):
        column_type = sqltypes.BOOLEAN
        batch_setup = DatabricksBatchTestSetup(
            config=DatabricksDatasourceTestConfig(column_types={self.BOOL_COL_NAME: column_type}),
            data=self.DATA_FRAME,
            extra_data={},
        )
        with batch_setup.batch_test_context() as batch:
            result = batch.validate(
                expect=ExpectColumnValuesToBeInSet(
                    column=self.BOOL_COL_NAME, value_set=[True, False]
                )
            )
        assert result.success

Contributor: is there a ticket for this?

    @pytest.mark.xfail(
        strict=True,
        reason="Expectation fails despite the type_ param being the same as the observed value.",
    )
    def test_boolean__expect_column_values_to_be_of_type(self):
        column_type = sqltypes.BOOLEAN
        batch_setup = DatabricksBatchTestSetup(
            config=DatabricksDatasourceTestConfig(column_types={self.BOOL_COL_NAME: column_type}),
            data=self.DATA_FRAME,
            extra_data={},
        )
        with batch_setup.batch_test_context() as batch:
            result = batch.validate(
                expect=ExpectColumnValuesToBeOfType(column=self.BOOL_COL_NAME, type_="Boolean")
            )
        assert result.success

    def test_date(self):
        column_type = sqltypes.DATE
        batch_setup = DatabricksBatchTestSetup(
            config=DatabricksDatasourceTestConfig(column_types={self.DATE_COL_NAME: column_type}),
            data=self.DATA_FRAME,
            extra_data={},
        )
        with batch_setup.batch_test_context() as batch:
            result = batch.validate(
                expect=ExpectColumnValuesToBeBetween(
                    column=self.DATE_COL_NAME,
                    min_value=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc).date(),
                    max_value=datetime(year=2024, month=1, day=1, tzinfo=timezone.utc).date(),
                )
            )
        assert result.success

    @pytest.mark.parametrize(
        "column_type",
        [
            sqltypes.TIMESTAMP(timezone=False),
            sqltypes.TIMESTAMP(timezone=True),
        ],
    )
    def test_timestamp(self, column_type: sqltypes.TIMESTAMP):
        batch_setup = DatabricksBatchTestSetup(
            config=DatabricksDatasourceTestConfig(column_types={self.DATE_COL_NAME: column_type}),
            data=self.DATA_FRAME,
            extra_data={},
        )
        with batch_setup.batch_test_context() as batch:
            result = batch.validate(
                expect=ExpectColumnValuesToBeBetween(
                    column=self.DATE_COL_NAME,
                    min_value=datetime(year=2021, month=1, day=1, tzinfo=timezone.utc),
                    max_value=datetime(year=2024, month=1, day=1, tzinfo=timezone.utc),
                )
            )
        assert result.success
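The Snowflake module below gets the same marker cleanup: the per-test @pytest.mark.snowflake decorators are removed in favor of a single module-level pytestmark. A minimal sketch of why the two forms are equivalent, assuming standard pytest marker behavior (the test names here are illustrative, not taken from the diff):

import pytest

# Old pattern: each test repeats the marker decorator.
@pytest.mark.snowflake
def test_number_old(): ...

# New pattern: pytest reads the module-level `pytestmark` attribute and applies
# it to every test collected from this module, so `pytest -m snowflake` still
# selects all of them.
pytestmark = pytest.mark.snowflake

def test_number_new(): ...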
@@ -14,6 +14,8 @@
from tests.integration.test_utils.data_source_config import SnowflakeDatasourceTestConfig
from tests.integration.test_utils.data_source_config.snowflake import SnowflakeBatchTestSetup

pytestmark = pytest.mark.snowflake
Contributor: that's the good stuff!



class TestSnowflakeDataTypes:
    """This set of tests ensures that we can run expectations against every data
@@ -24,7 +26,6 @@ class TestSnowflakeDataTypes:

    COLUMN = "col_a"

    @pytest.mark.snowflake
    def test_number(self):
        column_type = (
            SNOWFLAKE_TYPES.NUMBER
@@ -44,7 +45,6 @@ def test_number(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_int(self):
        column_type = sqltypes.INT  # equivalent to INTEGER, BIGINT, SMALLINT, TINYINT, BYTEINT
        batch_setup = SnowflakeBatchTestSetup(
@@ -62,7 +62,6 @@ def test_int(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_float(self):
        column_type = sqltypes.FLOAT  # equivalent to FLOAT4, FLOAT8, DOUBLE, DOUBLE PRECISION, REAL
        batch_setup = SnowflakeBatchTestSetup(
@@ -80,7 +79,6 @@ def test_float(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_varchar(self):
        column_type = sqltypes.VARCHAR  # equivalent to STRING, TEXT
        batch_setup = SnowflakeBatchTestSetup(
@@ -100,7 +98,6 @@ def test_varchar(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_char(self):
        column_type = sqltypes.CHAR  # length of 1, equivalent to CHARACTER
        batch_setup = SnowflakeBatchTestSetup(
@@ -120,7 +117,6 @@ def test_char(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_boolean(self):
        column_type = sqltypes.BOOLEAN
        batch_setup = SnowflakeBatchTestSetup(
@@ -134,7 +130,6 @@ def test_boolean(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_date(self):
        column_type = sqltypes.DATE
        batch_setup = SnowflakeBatchTestSetup(
@@ -160,7 +155,6 @@ def test_date(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_datetime(self):
        column_type = sqltypes.DATETIME
        batch_setup = SnowflakeBatchTestSetup(
@@ -186,7 +180,6 @@ def test_datetime(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_timestamp_tz(self):
        column_type = SNOWFLAKE_TYPES.TIMESTAMP_TZ
        batch_setup = SnowflakeBatchTestSetup(
@@ -212,7 +205,6 @@ def test_timestamp_tz(self):
            )
        assert result.success

    @pytest.mark.snowflake
    def test_timestamp_ntz(self):
        column_type = SNOWFLAKE_TYPES.TIMESTAMP_NTZ
        batch_setup = SnowflakeBatchTestSetup(
@@ -238,7 +230,6 @@ def test_timestamp_ntz(self):
            )
        assert result.success

    @pytest.mark.snowflake
    @pytest.mark.xfail(
        strict=True,
        reason="time is not an accepted min/max value parameter, and other date types "