- 
                Notifications
    
You must be signed in to change notification settings  - Fork 573
 
Open
Labels
bugSomething isn't workingSomething isn't working
Description
Describe the bug
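`hasNumberOfDistinctValues` fails with an `UNRESOLVED_COLUMN` `AnalysisException` when the checked column is named `count`. It seems to be a regression caused by #586.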
To Reproduce
In a local environment, the bug can be easily reproduced with the following code:
import com.amazon.deequ.checks.{Check, CheckLevel}
import com.amazon.deequ.{VerificationResult, VerificationSuite}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
/**
 * Minimal reproduction: runs a `hasNumberOfDistinctValues` check against a
 * column that is literally named `count` and prints the verification outcome.
 */
object HistogramChecker {

  /**
   * Builds a three-row DataFrame (`id`, `count`, `dt`), runs a single
   * error-level check asserting that `count` has exactly 3 distinct values,
   * then prints the raw result and the per-constraint results table.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .master("local")
      .appName("test")
      .getOrCreate()

    // Stop the session even when building the data or running the check throws,
    // so a failed run does not leave the local Spark context alive.
    try {
      val schema = StructType(
        Seq(
          StructField("id", StringType, nullable = false),
          StructField("count", IntegerType, nullable = false),
          StructField("dt", StringType, nullable = false)
        )
      )

      // Three rows with three different `count` values, so `_ == 3` is expected to hold.
      val rows = Seq(
        Row("id_0", 1, "20240118"),
        Row("id_1", 2, "20240118"),
        Row("id_2", 3, "20240118")
      )

      val df = spark.createDataFrame(
        spark.sparkContext.parallelize(rows),
        schema
      )

      val check = Check(CheckLevel.Error, "Distinct value checks")
        .hasNumberOfDistinctValues("count", _ == 3)

      val result = VerificationSuite()
        .onData(df)
        .addCheck(check)
        .run()

      println(result)
      VerificationResult.checkResultsAsDataFrame(spark, result).show(false)
    } finally {
      spark.stop()
    }
  }
}
Running it produces the following error:
VerificationResult(Error,Map(Check(Error,Distinct value checks,List(HistogramBinConstraint(Histogram(count,None,1000,None,false,Count)))) -> CheckResult(Check(Error,Distinct value checks,List(HistogramBinConstraint(Histogram(count,None,1000,None,false,Count)))),Error,List(ConstraintResult(HistogramBinConstraint(Histogram(count,None,1000,None,false,Count)),Failure,Some([UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;
'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true
+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]
   +- Aggregate [count#12], [count#12, count(1) AS count#16L]
      +- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]
         +- Project [cast(count#4 as string) AS count#9]
            +- LogicalRDD [id#3, count#4, dt#5], false
),Some(HistogramMetric(count,Failure(org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;
'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true
+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]
   +- Aggregate [count#12], [count#12, count(1) AS count#16L]
      +- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]
         +- Project [cast(count#4 as string) AS count#9]
            +- LogicalRDD [id#3, count#4, dt#5], false
))))))),Map(Histogram(count,None,1000,None,false,Count) -> HistogramMetric(count,Failure(org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;
'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true
+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]
   +- Aggregate [count#12], [count#12, count(1) AS count#16L]
      +- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]
         +- Project [cast(count#4 as string) AS count#9]
            +- LogicalRDD [id#3, count#4, dt#5], false
))))
The content of checkResultsAsDataFrame is as below:
+---------------------+-----------+------------+-------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|check                |check_level|check_status|constraint                                                         |constraint_status|constraint_message                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       |
+---------------------+-----------+------------+-------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|Distinct value checks|Error      |Error       |HistogramBinConstraint(Histogram(count,None,1000,None,false,Count))|Failure          |[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `com_amazon_deequ_dq_metrics_count` cannot be resolved. Did you mean one of the following? [`com_amazon_deequ_dq_metrics_count`, `com_amazon_deequ_dq_metrics_count`].;\n'Sort ['com_amazon_deequ_dq_metrics_count DESC NULLS LAST], true\n+- Project [count#12 AS com_amazon_deequ_dq_metrics_count#19, count#16L AS com_amazon_deequ_dq_metrics_count#20L]\n   +- Aggregate [count#12], [count#12, count(1) AS count#16L]\n      +- Project [coalesce(count#9, cast(NullValue as string)) AS count#12]\n         +- Project [cast(count#4 as string) AS count#9]\n            +- LogicalRDD [id#3, count#4, dt#5], false\n|
+---------------------+-----------+------------+-------------------------------------------------------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
Expected behavior
The `hasNumberOfDistinctValues` check should work for DataFrames containing columns with special names (e.g., `count`, `sum`).
Screenshots
If applicable, add screenshots to help explain your problem.
Additional context
I checked and confirmed that this only happens with version 2.0.8 and above (tested with 2.0.8-spark-3.4).
huymq1710
Metadata
Metadata
Assignees
Labels
bugSomething isn't workingSomething isn't working