Fix bug in twoStepGradientAggregator method. (#43)

kjytay · web-flow · commit b2e6e957a15c · 2023-07-25T11:01:32.000-07:00
diff --git a/dualip/src/main/scala/com/linkedin/dualip/objective/distributedobjective/DistributedRegularizedObjective.scala b/dualip/src/main/scala/com/linkedin/dualip/objective/distributedobjective/DistributedRegularizedObjective.scala
@@ -132,6 +132,42 @@ object DistributedRegularizedObjective {
     PartialPrimalStats(ax.toArray, cx, xx)
   }
 
+  /**
+    * This method accumulates sufficient statistics (i.e. ax, cx and xx) from a dataset of PartialPrimalStats.
+    * It returns an Array[(Int, Array[Double])] where the first entry of each pair is the index of a partition of the
+    * aggregated array (of length lambdaDim + 2) and the second entry is that partition's slice of the array. cx and xx
+    * are always stored in the last two positions of the aggregated array. For example, if the dual is of dimension 20,
+    * positions 0 to 19 hold the ax values and positions 20 and 21 hold cx and xx respectively.
+    *
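+    * @param primalStats   - dataset of partial statistics whose costs (ax), objective (cx) and xx are summed up
+    * @param lambdaDim     - dimension of the dual vector, i.e. the number of ax positions in the aggregated array
+    * @param numPartitions - number of partitions the aggregated array of length lambdaDim + 2 is split into
+    * @param sparkSession  - implicit spark session whose implicits are imported for the dataset operations
+    * @return array of (partition index, aggregated slice of that partition) pairs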
+    */
+  def accumulateSufficientStatistics(primalStats: Dataset[PartialPrimalStats], lambdaDim: Int, numPartitions: Int)
+    (implicit sparkSession: SparkSession): Array[(Int, Array[Double])] = {
+    import sparkSession.implicits._
+    primalStats.mapPartitions { partitionIterator =>
+      val acxxAgg = new Array[Double](lambdaDim + 2)
+      partitionIterator.foreach { stats =>
+        val ax = stats.costs
+        var i = 0
+        val axLen = ax.length
+        while (i < axLen) {
+          val (axIndex, axValue) = ax(i)
+          acxxAgg(axIndex) += axValue
+          i += 1
+        }
+        acxxAgg(lambdaDim) += stats.objective
+        acxxAgg(lambdaDim + 1) += stats.xx
+      }
+      // partition array
+      val x = ArrayAggregation.partitionArray(acxxAgg, numPartitions)
+      x.iterator
+    }.rdd.reduceByKey(ArrayAggregation.aggregateArrays(_, _)).collect()
+  }
+
   /**
     * Does aggregation in the following way:
     * 1. each data partition performs aggregation of the gradients into java Array (dense)
@@ -158,37 +194,43 @@ object DistributedRegularizedObjective {
     */
   def twoStepGradientAggregator(primalStats: Dataset[PartialPrimalStats], lambdaDim: Int, numPartitions: Int)
     (implicit sparkSession: SparkSession): PartialPrimalStats = {
-    import sparkSession.implicits._
-    val aggregate = primalStats.mapPartitions { partitionIterator =>
-      val acxxAgg = new Array[Double](lambdaDim + 2)
-      partitionIterator.foreach { stats =>
-        val ax = stats.costs
-        var i = 0
-        while (i < ax.length) {
-          val (axIndex, axValue) = ax(i)
-          acxxAgg(axIndex) += axValue
-          i += 1
-        }
-        acxxAgg(lambdaDim) += stats.objective
-        acxxAgg(lambdaDim + 1) += stats.xx
-      }
-      // partition array
-      val x = ArrayAggregation.partitionArray(acxxAgg, numPartitions)
-      x.iterator
-    }.rdd.reduceByKey(ArrayAggregation.aggregateArrays(_, _)).collect()
 
+    val aggregatedStats = accumulateSufficientStatistics(primalStats, lambdaDim, numPartitions)
     val ax = new Array[Double](lambdaDim)
     var cx = 0.0
     var xx = 0.0
-    aggregate.foreach { case (partition, subarray) =>
-      val (start, end) = ArrayAggregation.partitionBounds(lambdaDim + 2, numPartitions, partition)
-      if (partition == numPartitions - 1) {
-        // special case for last partition, as it holds 'xx' and 'cx' in the last two positions
-        cx = subarray(subarray.length - 2)
-        xx = subarray(subarray.length - 1)
-        System.arraycopy(subarray, 0, ax, start, subarray.length - 2)
-      } else {
-        System.arraycopy(subarray, 0, ax, start, subarray.length)
+    val axLen = ax.length
+    aggregatedStats.foreach { case (partition, subarray) =>
+      val (startIndex, endIndex) = ArrayAggregation.partitionBounds(lambdaDim + 2, numPartitions, partition)
+      if (partition < (numPartitions - 2) || (partition == numPartitions - 2 && endIndex <= axLen)) {
+        // aggregation of the ax values for different indices of the dual when we haven't reached the last two
+        // partitions or we have reached the second last partition and it still contains dual values only (i.e. no cx)
+        System.arraycopy(subarray, 0, ax, startIndex, subarray.length)
+      }
+      else if (partition == (numPartitions - 2)) {
+        // once we hit the second last partition with endIndex > axLen, we definitely have cx in here
+        cx = subarray(subarray.length - 1)
+        if (subarray.length > 1) {
+          // along with some of the remaining duals
+          System.arraycopy(subarray, 0, ax, startIndex, subarray.length - 1)
+        }
+      }
+      else {
+        // when we hit the last partition (assumed non-empty, which needs numPartitions <= lambdaDim + 2)
+        if (subarray.length == 1) {
+          // and it contains only one element, it has to be xx
+          xx = subarray(0)
+        }
+        else {
+          // if the last partition has more than one element, then the last two elements must be cx and xx respectively
+          cx = subarray(subarray.length - 2)
+          xx = subarray(subarray.length - 1)
+          if (subarray.length > 2) {
+            // if the last partition has more than two elements, it also holds some of the remaining duals in
+            // front of cx and xx
+            System.arraycopy(subarray, 0, ax, startIndex, subarray.length - 2)
+          }
+        }
       }
     }
     PartialPrimalStats(ax.zipWithIndex.map { case (v, i) => (i, v) }, cx, xx)
diff --git a/dualip/src/main/scala/com/linkedin/dualip/util/ArrayAggregation.scala b/dualip/src/main/scala/com/linkedin/dualip/util/ArrayAggregation.scala
@@ -49,9 +49,9 @@ object ArrayAggregation {
     * Method to find [start, end) positions in the array of a given partition.
     * Array is partitioned into roughly identical partitions, the length of a partition may differ by one.
     *
-    * @param arrayLength
-    * @param numPartitions
-    * @param partition
+    * @param arrayLength   - length of the array whose slicing positions are being considered
+    * @param numPartitions - the total number of partitions that the array of length arrayLength is split into
+    * @param partition     - the index of the partition in question, ranging from 0 to numPartitions - 1
     * @return
     */
   def partitionBounds(arrayLength: Int, numPartitions: Int, partition: Int): (Int, Int) = {
@@ -61,13 +61,17 @@ object ArrayAggregation {
     // we will stack larger partitions in the beginning of the array
     val basePartitionSize = arrayLength / numPartitions
     val numLargerPartitions = arrayLength - basePartitionSize * numPartitions
-    // second term accounts for larger partitions stacked prior to partition in question
-    val startIndex = partition * basePartitionSize + math.min(partition, numLargerPartitions)
+
     val partitionSize = if (partition < numLargerPartitions) {
       basePartitionSize + 1
     } else {
       basePartitionSize
     }
+    val startIndex = if (partition < numLargerPartitions) {
+      partition * partitionSize
+    } else {
+      numLargerPartitions + partition * partitionSize
+    }
     val endIndex = startIndex + partitionSize
     (startIndex, endIndex)
   }
diff --git a/dualip/src/test/scala/com/linkedin/dualip/objective/distributedobjective/DistributedRegularizedObjectiveTest.scala b/dualip/src/test/scala/com/linkedin/dualip/objective/distributedobjective/DistributedRegularizedObjectiveTest.scala
@@ -1,6 +1,8 @@
 package com.linkedin.dualip.objective.distributedobjective
 
 import com.linkedin.dualip.objective.PartialPrimalStats
+import com.linkedin.dualip.objective.distributedobjective.DistributedRegularizedObjective.accumulateSufficientStatistics
+import com.linkedin.dualip.util.ArrayAggregation.partitionBounds
 import com.linkedin.spark.common.lib.TestUtils
 import org.apache.spark.SparkConf
 import org.apache.spark.sql.SparkSession
@@ -15,15 +17,38 @@ class DistributedRegularizedObjectiveTest {
   val expectedAx: Map[Int, Double] = partialGradientsTestData.flatMap(_.costs).groupBy(_._1).mapValues(_.map(_._2).sum)
   val expectedXx: Double = partialGradientsTestData.map(_.xx).sum
 
+  @Test
+  def testAccumulateSufficientStatistics(): Unit = {
+    implicit val spark: SparkSession = TestUtils.createSparkSession("testAccumulateSufficientStatistics")
+    import spark.implicits._
+
+    val lambdaDim = 9
+    val numPartitions = 4
+    val primalStats = spark.createDataset(partialGradientsTestData)
+    val aggregatedStats = accumulateSufficientStatistics(primalStats, lambdaDim, 4).toMap
+
+    (0 until numPartitions - 1).foreach { partitionNumber =>
+      val (startIndex, endIndex) = partitionBounds(arrayLength = lambdaDim + 2, numPartitions = numPartitions,
+        partition = partitionNumber)
+      (startIndex until endIndex).zipWithIndex.foreach { case (arrayIndex, index) =>
+        Assert.assertEquals(aggregatedStats(partitionNumber)(index), expectedAx(arrayIndex), 0.01)
+      }
+    }
+  }
+
   @Test
   def testTwoStepGradientAggregation(): Unit = {
-    implicit val spark: SparkSession = TestUtils.createSparkSession()
+    implicit val spark: SparkSession = TestUtils.createSparkSession("testTwoStepGradientAggregation")
     import spark.implicits._
-    val ds = spark.createDataset(partialGradientsTestData).repartition(2)
-    val aggPrimalStats = DistributedRegularizedObjective.twoStepGradientAggregator(ds, 9, 2)
-    assertAlmostEqual(aggPrimalStats.costs.toMap, expectedAx)
-    assertAlmostEqual(aggPrimalStats.objective, expectedCx)
-    assertAlmostEqual(aggPrimalStats.xx, expectedXx)
+
+    Array(2, 5, 9).foreach { numPartitions =>
+      print("number of partitions " + numPartitions + "\n")
+      val ds = spark.createDataset(partialGradientsTestData).repartition(numPartitions)
+      val aggPrimalStats = DistributedRegularizedObjective.twoStepGradientAggregator(ds, 9, numPartitions)
+      assertAlmostEqual(aggPrimalStats.costs.toMap, expectedAx)
+      assertAlmostEqual(aggPrimalStats.objective, expectedCx)
+      assertAlmostEqual(aggPrimalStats.xx, expectedXx)
+    }
   }
 
   @Test
diff --git a/dualip/src/test/scala/com/linkedin/dualip/util/ArrayAggregationTest.scala b/dualip/src/test/scala/com/linkedin/dualip/util/ArrayAggregationTest.scala
@@ -1,5 +1,6 @@
 package com.linkedin.dualip.util
 
+import com.linkedin.dualip.objective.distributedobjective.DistributedRegularizedObjective.accumulateSufficientStatistics
 import com.linkedin.spark.common.lib.TestUtils
 import org.apache.spark.sql.{Dataset, SparkSession}
 import org.testng.Assert
@@ -31,11 +32,13 @@ class ArrayAggregationTest {
   @Test
   def testPartitionBounds(): Unit = {
     // even split into partitions
-    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 2, partition = 1), (4,8))
+    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 2, partition = 1), (4, 8))
     // uneven split, larger partitions should be stacked first
-    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 3, partition = 0), (0,3))
-    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 3, partition = 1), (3,6))
-    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 3, partition = 2), (6,8))
+    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 3, partition = 0), (0, 3))
+    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 3, partition = 1), (3, 6))
+    Assert.assertEquals(partitionBounds(arrayLength = 8, numPartitions = 3, partition = 2), (6, 8))
+    Assert.assertEquals(partitionBounds(arrayLength = 40, numPartitions = 20, partition = 10), (20, 22))
+    Assert.assertEquals(partitionBounds(arrayLength = 40, numPartitions = 38, partition = 37), (39, 40))
   }
 
   @Test(