Commit 1320bf9

rahulsmahadev authored and tdas committed
[SC-33980][WARMFIX][DELTA] Fix incorrect metrics in Delete/Update and add more tests
## What changes were proposed in this pull request?

- Fix incorrect way of capturing the number of copied rows: copied rows were incorrectly computed based on scanned files. This change uses the computed write stats in DeleteCommand instead. For UpdateCommand, the counting UDF is now applied in the right place.
- Fix incorrect way of capturing the number of removed files.

## How was this patch tested?

- Added more tests
- Changed existing tests

Author: Rahul Mahadev <[email protected]>

#9576 is resolved by rahulsmahadev/rowLevelHistoryFix.

GitOrigin-RevId: 83b991e4952a263549e1de2733003885c89737a9
1 parent 2115525 commit 1320bf9
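For orientation, here is a minimal sketch of the new derivation (my own summary, not code from this commit): copied rows now come from the write stats (numOutputRows, reported by the write stats tracker mentioned in the diffs below) rather than from a row counter applied to scanned files.

  // Illustrative only, using the metric names that appear in the diffs below.
  // DELETE: every row written to the rewritten files is a surviving (copied) row.
  def deleteCopiedRows(numOutputRows: Long): Long = numOutputRows
  // UPDATE: copied rows are the written rows that were not updated.
  def updateCopiedRows(numOutputRows: Long, numUpdatedRows: Long): Long =
    numOutputRows - numUpdatedRows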

5 files changed: 64 additions, 54 deletions


src/main/scala/org/apache/spark/sql/delta/DeltaOperations.scala

Lines changed: 11 additions & 9 deletions
@@ -75,13 +75,12 @@ object DeltaOperations {
     override val operationMetrics: Set[String] = DeltaOperationMetrics.DELETE
 
     override def transformMetrics(metrics: Map[String, SQLMetric]): Map[String, String] = {
-      // find the case where deletedRows are not captured
-      val numTotalRows = metrics("numTotalRows").value
       var strMetrics = super.transformMetrics(metrics)
-      strMetrics += "numCopiedRows" -> (numTotalRows -
-        metrics("numDeletedRows").value).toString
-      if (strMetrics("numDeletedRows") == "0" && strMetrics("numCopiedRows") == "0" &&
-          strMetrics("numRemovedFiles") != "0") {
+      if (metrics.contains("numOutputRows")) {
+        strMetrics += "numCopiedRows" -> metrics("numOutputRows").value.toString
+      }
+      // find the case where deletedRows are not captured
+      if (strMetrics("numDeletedRows") == "0" && strMetrics("numRemovedFiles") != "0") {
         // identify when row level metrics are unavailable. This will happen when the entire
         // table or partition are deleted.
         strMetrics -= "numDeletedRows"
@@ -149,15 +148,18 @@ object DeltaOperations {
     override val operationMetrics: Set[String] = DeltaOperationMetrics.UPDATE
 
     override def transformMetrics(metrics: Map[String, SQLMetric]): Map[String, String] = {
-      val numTotalRows = metrics("numTotalRows").value
       val numOutputRows = metrics("numOutputRows").value
       val numUpdatedRows = metrics("numUpdatedRows").value
       var strMetrics = super.transformMetrics(metrics)
-      strMetrics += "numCopiedRows" -> (numTotalRows - numUpdatedRows).toString
       // In the case where the numUpdatedRows is not captured in the UpdateCommand implementation
       // we can siphon out the metrics from the BasicWriteStatsTracker for that command.
-      if(numTotalRows == 0 && numUpdatedRows == 0 && numOutputRows != 0) {
+      // This is for the case where the entire partition is re-written.
+      if (numUpdatedRows == 0 && numOutputRows != 0) {
         strMetrics += "numUpdatedRows" -> numOutputRows.toString
+        strMetrics += "numCopiedRows" -> "0"
+      } else {
+        strMetrics += "numCopiedRows" -> (
+          numOutputRows - strMetrics("numUpdatedRows").toInt).toString
       }
       strMetrics
     }
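To make the new UPDATE branch concrete, here is an illustrative calculation (my own numbers, chosen to match the expectations in DescribeDeltaHistorySuite further down):

  // Row-level update: the rewritten file holds 3 rows, 1 of which was updated.
  val numOutputRows = 3L
  val numUpdatedRows = 1L
  val numCopiedRows =
    if (numUpdatedRows == 0 && numOutputRows != 0) 0L  // whole-partition rewrite: all output rows are treated as updated
    else numOutputRows - numUpdatedRows                // otherwise: written rows minus updated rows
  assert(numCopiedRows == 2L)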

src/main/scala/org/apache/spark/sql/delta/commands/DeleteCommand.scala

Lines changed: 4 additions & 9 deletions
@@ -56,8 +56,7 @@ case class DeleteCommand(
   override lazy val metrics = Map[String, SQLMetric](
     "numRemovedFiles" -> createMetric(sc, "number of files removed."),
     "numAddedFiles" -> createMetric(sc, "number of files added."),
-    "numDeletedRows" -> createMetric(sc, "number of rows deleted."),
-    "numTotalRows" -> createMetric(sc, "total number of rows.")
+    "numDeletedRows" -> createMetric(sc, "number of rows deleted.")
   )
 
   final override def run(sparkSession: SparkSession): Seq[Row] = {
@@ -96,6 +95,7 @@ case class DeleteCommand(
         scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000
 
         val operationTimestamp = System.currentTimeMillis()
+        metrics("numRemovedFiles").set(allFiles.size)
         allFiles.map(_.removeWithTimestamp(operationTimestamp))
       case Some(cond) =>
         val (metadataPredicates, otherPredicates) =
@@ -111,6 +111,7 @@ case class DeleteCommand(
           scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000
           numTouchedFiles = candidateFiles.size
 
+          metrics("numRemovedFiles").set(numTouchedFiles)
           candidateFiles.map(_.removeWithTimestamp(operationTimestamp))
         } else {
           // Case 3: Delete the rows based on the condition.
@@ -125,12 +126,7 @@ case class DeleteCommand(
           // that only involves the affected files instead of all files.
           val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex)
           val data = Dataset.ofRows(sparkSession, newTarget)
-          val totalRowsCount = metrics("numTotalRows")
           val deletedRowCount = metrics("numDeletedRows")
-          val totalRowUdf = udf { () =>
-            totalRowsCount += 1
-            true
-          }.asNondeterministic()
           val deletedRowUdf = udf { () =>
             deletedRowCount += 1
             true
@@ -141,14 +137,14 @@ case class DeleteCommand(
               Array.empty[String]
             } else {
               data
-                .filter(totalRowUdf())
                 .filter(new Column(cond))
                 .filter(deletedRowUdf())
                 .select(new Column(InputFileName())).distinct()
                 .as[String].collect()
             }
           }
 
+          metrics("numRemovedFiles").set(filesToRewrite.size)
           scanTimeMs = (System.nanoTime() - startTime) / 1000 / 1000
           if (filesToRewrite.isEmpty) {
             // Case 3.1: no row matches and no delete will be triggered
@@ -181,7 +177,6 @@ case class DeleteCommand(
       }
     }
     if (deleteActions.nonEmpty) {
-      metrics("numRemovedFiles").set(numTouchedFiles)
       metrics("numAddedFiles").set(numRewrittenFiles)
       txn.registerSQLMetrics(sparkSession, metrics)
       txn.commit(deleteActions, DeltaOperations.Delete(condition.map(_.sql).toSeq))
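The row-counting pattern the command keeps using for the deleted-row metric can be seen in isolation below. This is a self-contained sketch, not the command's code: `spark` is an existing SparkSession, the table path and the `key === 16` condition are made up, and the SQLMetric here stands in for `metrics("numDeletedRows")`.

  import org.apache.spark.sql.execution.metric.SQLMetrics
  import org.apache.spark.sql.functions.{input_file_name, udf}

  val deletedRowCount = SQLMetrics.createMetric(spark.sparkContext, "number of rows deleted.")
  // Marked nondeterministic so the optimizer does not reorder or duplicate the side-effecting filter.
  val deletedRowUdf = udf { () => deletedRowCount += 1; true }.asNondeterministic()

  import spark.implicits._
  val data = spark.read.format("delta").load("/tmp/some_delta_table")  // hypothetical table
  val filesToRewrite = data
    .filter($"key" === 16)          // stand-in for the DELETE condition
    .filter(deletedRowUdf())        // increments the metric once per matching row
    .select(input_file_name()).distinct()
    .as[String].collect()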

src/main/scala/org/apache/spark/sql/delta/commands/UpdateCommand.scala

Lines changed: 2 additions & 9 deletions
@@ -56,8 +56,7 @@ case class UpdateCommand(
   override lazy val metrics = Map[String, SQLMetric](
     "numAddedFiles" -> createMetric(sc, "number of files added."),
     "numRemovedFiles" -> createMetric(sc, "number of files removed."),
-    "numUpdatedRows" -> createMetric(sc, "number of rows updated."),
-    "numTotalRows" -> createMetric(sc, "number of rows copied.")
+    "numUpdatedRows" -> createMetric(sc, "number of rows updated.")
   )
 
   final override def run(sparkSession: SparkSession): Seq[Row] = {
@@ -123,20 +122,14 @@ case class UpdateCommand(
       // that only involves the affected files instead of all files.
       val newTarget = DeltaTableUtils.replaceFileIndex(target, fileIndex)
       val data = Dataset.ofRows(sparkSession, newTarget)
-      val totalRowsCount = metrics("numTotalRows")
       val updatedRowCount = metrics("numUpdatedRows")
-      val totalRowUdf = udf { () =>
-        totalRowsCount += 1
-        true
-      }.asNondeterministic()
       val updatedRowUdf = udf { () =>
         updatedRowCount += 1
         true
       }.asNondeterministic()
       val filesToRewrite =
         withStatusCode("DELTA", s"Finding files to rewrite for UPDATE operation") {
-          data.filter(totalRowUdf())
-            .filter(new Column(updateCondition))
+          data.filter(new Column(updateCondition))
             .filter(updatedRowUdf())
             .select(input_file_name())
             .distinct().as[String].collect()
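The essence of the UpdateCommand change is ordering: the counting UDF now runs after the update condition, so it only counts rows that will actually be updated. A small standalone illustration (my own example, using a plain LongAccumulator in place of the command's SQLMetric; `spark` is an existing SparkSession):

  import org.apache.spark.sql.functions.{col, udf}

  val updatedRows = spark.sparkContext.longAccumulator("updatedRows")
  val updatedRowUdf = udf { () => updatedRows.add(1); true }.asNondeterministic()

  val df = spark.range(10).toDF("key")  // keys 0..9
  // Condition first, counter second: only the 3 matching rows increment the accumulator.
  df.filter(col("key") > 6).filter(updatedRowUdf()).collect()
  assert(updatedRows.sum == 3)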

src/test/scala/org/apache/spark/sql/delta/DeltaVacuumSuite.scala

Lines changed: 1 addition & 1 deletion
@@ -378,7 +378,7 @@ trait DeltaVacuumSuiteBase extends QueryTest
       "numRemovedFiles" -> createMetric(sparkContext, "number of files removed."),
       "numAddedFiles" -> createMetric(sparkContext, "number of files added."),
       "numDeletedRows" -> createMetric(sparkContext, "number of rows deleted."),
-      "numTotalRows" -> createMetric(sparkContext, "total number of rows.")
+      "numCopiedRows" -> createMetric(sparkContext, "total number of rows.")
     )
     txn.registerSQLMetrics(spark, metrics)
     txn.commit(Seq(RemoveFile(path, Option(clock.getTimeMillis()))), Delete("true" :: Nil))

src/test/scala/org/apache/spark/sql/delta/DescribeDeltaHistorySuite.scala

Lines changed: 46 additions & 26 deletions
@@ -452,29 +452,40 @@ trait DescribeDeltaHistorySuiteBase
 
   test("operation metrics - update") {
     withSQLConf(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED.key -> "true") {
-      val numRows = 100
-      val numPartitions = 5
       withTempDir { tempDir =>
-        // Create a Delta table
-        spark.range(numRows).repartition(numPartitions)
-          .withColumnRenamed("id", "key")
+        // Create the initial table as a single file
+        Seq(1, 2, 5, 11, 21, 3, 4, 6, 9, 7, 8, 0).toDF("key")
           .withColumn("value", 'key % 2)
           .write
           .format("delta")
           .save(tempDir.getAbsolutePath)
-        val deltaTable = io.delta.tables.DeltaTable.forPath(tempDir.getAbsolutePath)
 
-        // update some records
-        deltaTable.update(col("key") < 1, Map("key" -> lit(1)))
+        // append additional data with the same number range to the table.
+        // This data is saved as a separate file as well
+        Seq(15, 16, 17).toDF("key")
+          .withColumn("value", 'key % 2)
+          .repartition(1)
+          .write
+          .format("delta")
+          .mode("append")
+          .save(tempDir.getAbsolutePath)
+        val deltaTable = io.delta.tables.DeltaTable.forPath(spark, tempDir.getAbsolutePath)
+        val deltaLog = DeltaLog.forTable(spark, tempDir.getAbsolutePath)
+        deltaLog.snapshot.numOfFiles
+
+        // update the table
+        deltaTable.update(col("key") === lit("16"), Map("value" -> lit("1")))
+        // The file from the append gets updated but the file from the initial table gets scanned
+        // as well. We want to make sure numCopied rows is calculated from written files and not
+        // scanned files[SC-33980]
 
-        // check operation metrics
+        // get operation metrics
         val operationMetrics = getOperationMetrics(deltaTable.history(1))
-        var expectedRowCount = numRows - 1
         val expectedMetrics = Map(
           "numAddedFiles" -> "1",
           "numRemovedFiles" -> "1",
           "numUpdatedRows" -> "1",
-          "numCopiedRows" -> expectedRowCount.toString
+          "numCopiedRows" -> "2" // There should be only three rows in total(updated + copied)
         )
         checkOperationMetrics(expectedMetrics, operationMetrics, DeltaOperationMetrics.UPDATE)
       }
@@ -517,31 +528,40 @@ trait DescribeDeltaHistorySuiteBase
 
   test("operation metrics - delete") {
     withSQLConf(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED.key -> "true") {
-      val numRows = 100
-      val rowsToDelete = 10
       withTempDir { tempDir =>
-        // Create a delta table
-        spark.range(numRows).repartition(5)
-          .withColumnRenamed("id", "key")
+        // Create the initial table as a single file
+        Seq(1, 2, 5, 11, 21, 3, 4, 6, 9, 7, 8, 0).toDF("key")
           .withColumn("value", 'key % 2)
+          .repartition(1)
          .write
          .format("delta")
          .save(tempDir.getAbsolutePath)
-        val deltaTable = io.delta.tables.DeltaTable.forPath(tempDir.getAbsolutePath)
+
+        // Append to the initial table additional data in the same numerical range
+        Seq(15, 16, 17).toDF("key")
+          .withColumn("value", 'key % 2)
+          .repartition(1)
+          .write
+          .format("delta")
+          .mode("append")
+          .save(tempDir.getAbsolutePath)
+        val deltaTable = io.delta.tables.DeltaTable.forPath(spark, tempDir.getAbsolutePath)
         val deltaLog = DeltaLog.forTable(spark, tempDir.getAbsolutePath)
-        val numFilesBeforeDelete = deltaLog.snapshot.numOfFiles
+        deltaLog.snapshot.numOfFiles
 
-        // delete records
-        deltaTable.delete(col("key") < rowsToDelete)
+        // delete the table
+        deltaTable.delete(col("key") === lit("16"))
+        // The file from the append gets deleted but the file from the initial table gets scanned
+        // as well. We want to make sure numCopied rows is calculated from the written files instead
+        // of the scanned files.[SC-33980]
 
-        // check operation metrics
-        val numFilesAfterDelete = deltaLog.snapshot.numOfFiles
+        // get operation metrics
         val operationMetrics = getOperationMetrics(deltaTable.history(1))
         val expectedMetrics = Map(
-          "numAddedFiles" -> numFilesAfterDelete.toString,
-          "numRemovedFiles" -> numFilesBeforeDelete.toString,
-          "numDeletedRows" -> rowsToDelete.toString,
-          "numCopiedRows" -> (numRows - rowsToDelete).toString
+          "numAddedFiles" -> "1",
+          "numRemovedFiles" -> "1",
+          "numDeletedRows" -> "1",
+          "numCopiedRows" -> "2" // There should be only three rows in total(deleted + copied)
         )
         checkOperationMetrics(expectedMetrics, operationMetrics, DeltaOperationMetrics.DELETE)
       }
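For reference, the metrics these tests assert on are read back from the table history, where each commit carries an operationMetrics map. A hedged sketch of doing this directly (hypothetical path; the suite's getOperationMetrics helper does the equivalent):

  import io.delta.tables.DeltaTable

  val deltaTable = DeltaTable.forPath(spark, "/tmp/some_delta_table")  // hypothetical table
  val lastCommit = deltaTable.history(1).select("operationMetrics").head()
  val operationMetrics = lastCommit.getAs[Map[String, String]]("operationMetrics")
  // e.g. operationMetrics("numDeletedRows") == "1", operationMetrics("numCopiedRows") == "2"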
