
Commit 0be5f0e

Merge pull request #92 from pspoerri/ps/fix_pipeline
Fix Github CI build and some minor linting issues.
2 parents: b2ae548 + 3b2712a

File tree

6 files changed: +24 -81 lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 6 deletions
@@ -53,7 +53,7 @@ jobs:
           scala: 2.12.18
         - spark: 3.5.0
           scala: 2.13.8
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04 # Upgrading this version requires an additional sbt setup step.
     env:
       SPARK_VERSION: ${{ matrix.spark }}
       SCALA_VERSION: ${{ matrix.scala }}
@@ -68,18 +68,17 @@ jobs:
           cache: sbt
       - name: Check formatting
         shell: bash
-        run: |
+        run: |-
           echo "If either of these checks fail run: 'sbt scalafmtAll && sbt scalafmtSbt'"
           sbt scalafmtSbtCheck
           sbt scalafmtCheckAll
-      - name: Test Default Shuffle Fetch
+      - name: Run tests
         shell: bash
-        if: startsWith(matrix.scala, '2.12.')
         run: |
           sbt test
-      - name: Test Spark Shuffle Fetch
+      - name: Run tests with Spark Shuffle Fetch enabled
         shell: bash
-        if: startsWith(matrix.scala, '2.12.') && !startsWith(matrix.spark, '3.2.')
+        if: ${{ !startsWith(matrix.spark, '3.2.') }}
         env:
           USE_SPARK_SHUFFLE_FETCH: "true"
         run: |

README.md

Lines changed: 2 additions & 2 deletions
@@ -101,8 +101,8 @@ to Java > 11:
  --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED
  --add-opens=java.base/sun.nio.ch=ALL-UNNAMED
  --add-opens=java.base/sun.nio.cs=ALL-UNNAMED
- --add-opens=java.base/sun.security.action=ALL-UNNAMED -
- -add-opens=java.base/sun.util.calendar=ALL-UNNAMED
+ --add-opens=java.base/sun.security.action=ALL-UNNAMED
+ --add-opens=java.base/sun.util.calendar=ALL-UNNAMED
  --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED
  ```
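As a usage illustration (not part of this commit), the sketch below shows one way the corrected --add-opens flags can reach the executors from application code via spark.executor.extraJavaOptions; the equivalent driver-side flags normally have to be supplied when the driver JVM is launched, for example through spark-submit --conf spark.driver.extraJavaOptions=... . The object name and the flag subset shown here are illustrative, not taken from the repository.

import org.apache.spark.sql.SparkSession

// Hypothetical example object; only a subset of the README's flags is shown.
object AddOpensExample {
  private val addOpens = Seq(
    "--add-opens=java.base/sun.security.action=ALL-UNNAMED",
    "--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"
  ).mkString(" ")

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("add-opens-example")
      // Executor JVMs are forked after this point, so the setting takes effect for them;
      // driver flags must instead be passed on the spark-submit command line.
      .config("spark.executor.extraJavaOptions", addOpens)
      .getOrCreate()

    spark.range(10).count() // trivial action to confirm the session works
    spark.stop()
  }
}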

build.sbt

Lines changed: 4 additions & 15 deletions
@@ -3,10 +3,10 @@
 // SPDX-License-Identifier: Apache2.0
 //

-scalaVersion := sys.env.getOrElse("SCALA_VERSION", "2.12.15")
+scalaVersion := sys.env.getOrElse("SCALA_VERSION", "2.12.18")
 organization := "com.ibm"
 name := "spark-s3-shuffle"
-val sparkVersion = sys.env.getOrElse("SPARK_VERSION", "3.3.1")
+val sparkVersion = sys.env.getOrElse("SPARK_VERSION", "3.5.0")

 enablePlugins(GitVersioning, BuildInfoPlugin)

@@ -29,21 +29,10 @@ buildInfoKeys ++= Seq[BuildInfoKey](
 libraryDependencies ++= Seq(
   "org.apache.spark" %% "spark-core" % sparkVersion % "provided",
   "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
-  "org.apache.spark" %% "spark-hadoop-cloud" % sparkVersion % "compile"
+  "org.apache.spark" %% "spark-hadoop-cloud" % sparkVersion % "compile",
+  "org.scalatest" %% "scalatest" % "3.2.19" % Test
 )

-libraryDependencies ++= (if (scalaBinaryVersion.value == "2.12")
-                           Seq(
-                             "junit" % "junit" % "4.13.2" % Test,
-                             "org.scalatest" %% "scalatest" % "3.2.2" % Test,
-                             "ch.cern.sparkmeasure" %% "spark-measure" % "0.18" % Test,
-                             "org.scalacheck" %% "scalacheck" % "1.15.2" % Test,
-                             "org.mockito" % "mockito-core" % "3.4.6" % Test,
-                             "org.scalatestplus" %% "mockito-3-4" % "3.2.9.0" % Test,
-                             "com.github.sbt" % "junit-interface" % "0.13.3" % Test
-                           )
-                         else Seq())
-
 javacOptions ++= Seq("-source", "1.8", "-target", "1.8")
 javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled")
 scalacOptions ++= Seq("-deprecation", "-unchecked")

src/main/scala/org/apache/spark/shuffle/helper/S3ShuffleDispatcher.scala

Lines changed: 2 additions & 8 deletions
@@ -211,7 +211,6 @@ class S3ShuffleDispatcher extends Logging {
   def closeCachedBlocks(shuffleIndex: Int): Unit = {
     val filter = (blockId: BlockId) =>
       blockId match {
-        case RDDBlockId(_, _) => false
         case ShuffleBlockId(shuffleId, _, _) => shuffleId == shuffleIndex
         case ShuffleBlockBatchId(shuffleId, _, _, _) => shuffleId == shuffleIndex
         case ShuffleBlockChunkId(shuffleId, _, _, _) => shuffleId == shuffleIndex
@@ -223,14 +222,9 @@ class S3ShuffleDispatcher extends Logging {
         case ShuffleMergedDataBlockId(_, shuffleId, _, _) => shuffleId == shuffleIndex
         case ShuffleMergedIndexBlockId(_, shuffleId, _, _) => shuffleId == shuffleIndex
         case ShuffleMergedMetaBlockId(_, shuffleId, _, _) => shuffleId == shuffleIndex
-        case BroadcastBlockId(_, _) => false
-        case TaskResultBlockId(_) => false
-        case StreamBlockId(_, _) => false
-        case TempLocalBlockId(_) => false
-        case TempShuffleBlockId(_) => false
-        case TestBlockId(_) => false
+        case _ => false
       }
-    cachedFileStatus.remove(filter, _)
+    cachedFileStatus.remove(filter, None)
   }

   /** Open a block for writing.
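The change above collapses the explicitly enumerated non-shuffle block types into a single wildcard case, so any block that does not belong to the given shuffle simply falls through and is left alone. A minimal, self-contained sketch of that pattern-match style follows; the simplified block-ID case classes and the helper isForShuffle are hypothetical stand-ins, not Spark's real storage classes.

// Hypothetical, simplified block-ID hierarchy used only to illustrate the
// filter style used in closeCachedBlocks.
sealed trait BlockId
final case class ShuffleBlockId(shuffleId: Int, mapId: Long, reduceId: Int) extends BlockId
final case class ShuffleBlockBatchId(shuffleId: Int, mapId: Long, startReduceId: Int, endReduceId: Int) extends BlockId
final case class RDDBlockId(rddId: Int, splitIndex: Int) extends BlockId

object BlockFilterExample {
  // True only for shuffle blocks of the given shuffle; every other block type
  // hits the wildcard, mirroring the new `case _ => false`.
  def isForShuffle(shuffleIndex: Int)(blockId: BlockId): Boolean =
    blockId match {
      case ShuffleBlockId(shuffleId, _, _)         => shuffleId == shuffleIndex
      case ShuffleBlockBatchId(shuffleId, _, _, _) => shuffleId == shuffleIndex
      case _                                       => false
    }

  def main(args: Array[String]): Unit = {
    val blocks: Seq[BlockId] =
      Seq(ShuffleBlockId(1, 0L, 0), ShuffleBlockId(2, 0L, 0), RDDBlockId(7, 3))

    // Only the block that belongs to shuffle 1 passes the filter.
    println(blocks.filter(isForShuffle(1))) // List(ShuffleBlockId(1,0,0))
  }
}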

src/main/scala/org/apache/spark/shuffle/sort/S3ShuffleManager.scala

Lines changed: 1 addition & 6 deletions
@@ -23,20 +23,15 @@
 package org.apache.spark.shuffle.sort

 import com.ibm.SparkS3ShuffleBuild
-import org.apache.hadoop.fs.{Path, PathFilter}
 import org.apache.spark._
-import org.apache.spark.internal.{Logging, config}
+import org.apache.spark.internal.Logging
 import org.apache.spark.shuffle._
 import org.apache.spark.shuffle.api.ShuffleExecutorComponents
 import org.apache.spark.shuffle.helper.{S3ShuffleDispatcher, S3ShuffleHelper}
 import org.apache.spark.storage.S3ShuffleReader

-import java.io.IOException
 import scala.collection.JavaConverters._
 import scala.collection.mutable
-import scala.concurrent.ExecutionContext.Implicits.global
-import scala.concurrent.duration.Duration
-import scala.concurrent.{Await, Future}

 /** This class was adapted from Apache Spark: SortShuffleManager.scala
  */

src/test/scala-2.12/org/apache/spark/shuffle/S3ShuffleManagerTest.scala renamed to src/test/scala/org/apache/spark/shuffle/S3ShuffleManagerTest.scala

Lines changed: 10 additions & 44 deletions
@@ -22,11 +22,11 @@

 package org.apache.spark.shuffle

-import ch.cern.sparkmeasure.StageMetrics
 import org.apache.spark._
 import org.apache.spark.sql.SparkSession
-import org.junit.Test
-import org.scalatest.Assertions._
+import org.scalatest._
+import org.scalatest.funsuite.AnyFunSuite
+import org.scalatest.matchers.should.Matchers

 import java.util.UUID

@@ -39,24 +39,21 @@ case class CombinerClass()
 /*
  * The test has been adapted from the following pull request https://github.com/apache/spark/pull/34864/files .
  */
-class S3ShuffleManagerTest {
+class S3ShuffleManagerTest extends AnyFunSuite {

-  @Test
-  def foldByKey(): Unit = {
+  test("foldByKey") {
     val conf = newSparkConf()
     runWithSparkConf(conf)
   }

-  @Test
-  def foldByKey_zeroBuffering(): Unit = {
+  test("foldByKey_zeroBuffering") {
     val conf = newSparkConf()
     conf.set("spark.reducer.maxSizeInFlight", "0")
     conf.set("spark.network.maxRemoteBlockSizeFetchToMem", "0")
     runWithSparkConf(conf)
   }

-  @Test
-  def runWithSparkConf_noMapSideCombine(): Unit = {
+  test("runWithSparkConf_noMapSideCombine") {
     val conf = newSparkConf()
     conf.set("spark.shuffle.sort.bypassMergeThreshold", "1000")
     val sc = new SparkContext(conf)
@@ -75,8 +72,7 @@ class S3ShuffleManagerTest {
     }
   }

-  @Test
-  def forceSortShuffle(): Unit = {
+  test("forceSortShuffle") {
     val conf = newSparkConf()
     conf.set("spark.shuffle.sort.bypassMergeThreshold", "1")
     val sc = new SparkContext(conf)
@@ -104,8 +100,7 @@ class S3ShuffleManagerTest {
     }
   }

-  @Test
-  def testCombineByKey(): Unit = {
+  test("testCombineByKey") {
     val conf = newSparkConf()
     val sc = new SparkContext(conf)
     try {
@@ -148,8 +143,7 @@ class S3ShuffleManagerTest {
     }
   }

-  @Test
-  def teraSortLike(): Unit = {
+  test("teraSortLike") {
     val conf = newSparkConf()
     conf.set("spark.shuffle.sort.bypassMergeThreshold", "1")
     val sc = new SparkContext(conf)
@@ -179,34 +173,6 @@ class S3ShuffleManagerTest {
     }
   }

-  @Test
-  def runWithSparkMeasure(): Unit = {
-    val conf = newSparkConf()
-    val sc = new SparkContext(conf)
-    val spark = SparkSession.builder().sparkContext(sc).getOrCreate()
-    val stageMetrics = StageMetrics(spark)
-    val result = stageMetrics.runAndMeasure {
-      spark.sql("select count(*) from range(1000) cross join range(1000) cross join range(1000)").take(1)
-    }
-    assert(result.map(r => r.getLong(0)).head === 1000000000)
-
-    val timestamp = System.currentTimeMillis()
-    stageMetrics.createStageMetricsDF(s"spark_measure_test_${timestamp}")
-    val metrics = stageMetrics.aggregateStageMetrics(s"spark_measure_test_${timestamp}")
-    // get all of the stats
-    val (runTime, bytesRead, recordsRead, bytesWritten, recordsWritten) =
-      metrics
-        .select("elapsedTime", "bytesRead", "recordsRead", "bytesWritten", "recordsWritten")
-        .take(1)
-        .map(r => (r.getLong(0), r.getLong(1), r.getLong(2), r.getLong(3), r.getLong(4)))
-        .head
-    println(
-      f"Elapsed: ${runTime}, bytesRead: ${bytesRead}, recordsRead: ${recordsRead}, bytesWritten ${bytesWritten}, recordsWritten: ${recordsWritten}"
-    )
-    spark.stop()
-    spark.close()
-  }
-
   private def runWithSparkConf(conf: SparkConf) = {
     val sc = new SparkContext(conf)