From 8668754ff8d9c093d1e143f2bde9fce7744f54c8 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 16:25:04 -0800 Subject: [PATCH 001/120] compilable --- .../celeborn/spark/FailedShuffleCleaner.scala | 148 ++++++++++++++++++ .../org/apache/spark/SparkContextHelper.scala | 1 + .../spark/scheduler/RunningStageManager.scala | 30 ++++ .../shuffle/celeborn/SparkShuffleManager.java | 12 +- .../spark/shuffle/celeborn/SparkUtils.java | 14 ++ .../celeborn/client/LifecycleManager.scala | 24 ++- .../commit/ReducePartitionCommitHandler.scala | 4 +- .../apache/celeborn/common/CelebornConf.scala | 9 ++ 8 files changed, 236 insertions(+), 6 deletions(-) create mode 100644 client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala rename {tests/spark-it/src/test => client-spark/common/src/main}/scala/org/apache/spark/SparkContextHelper.scala (99%) create mode 100644 client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala new file mode 100644 index 00000000000..c0827c6e6d1 --- /dev/null +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.celeborn.spark +import java.util +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} +import java.util.concurrent.atomic.AtomicReference +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} +import org.apache.celeborn.client.LifecycleManager +import org.apache.celeborn.common.internal.Logging + +private[celeborn] object FailedShuffleCleaner extends Logging { + + private val lifecycleManager = new AtomicReference[LifecycleManager](null) + // in celeborn ids + private val shufflesToBeCleand = new LinkedBlockingQueue[Int]() + private val cleanedShuffleIds = new mutable.HashSet[Int] + // celeborn shuffle id to stage id referred to it + private[celeborn] val celebornShuffleIdToReferringStages = + new ConcurrentHashMap[Int, mutable.HashSet[Int]]() + + private val lock = new Object + val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" + + private[celeborn] var runningStageManager: RunningStageManager = buildRunningStageChecker() + + // for testing + private def buildRunningStageChecker(): RunningStageManager = { + if (System.getProperty(RUNNING_STAGE_CHECKER_CLASS) == null) { + new RunningStageManagerImpl + } else { + val className = System.getProperty(RUNNING_STAGE_CHECKER_CLASS) + val claz = Class.forName(className) + claz.getDeclaredConstructor().newInstance().asInstanceOf[RunningStageManager] + } + } + + // for test + def reset(): Unit = { + lifecycleManager.set(null) + shufflesToBeCleand.clear() + cleanedShuffleIds.clear() + celebornShuffleIdToReferringStages.clear() + runningStageManager = buildRunningStageChecker() + } + + def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { + // this is only implemented/tested with Spark for now + val Array(_, stageId, _) = appShuffleIdentifier.split('-') + val stageIds = + celebornShuffleIdToReferringStages.computeIfAbsent( + celebornShuffleId, + (_) => new mutable.HashSet[Int]()) + lock.synchronized { + stageIds.add(stageId.toInt) + } + } + + private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = { + val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && + celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) + if (ret) { + logInfo(s"only stage $stageId refers to shuffle $celebornShuffleId, adding for clean up") + } + ret + } + + def addShuffleIdToBeCleaned( + celebornShuffleId: Int, + appShuffleIdentifier: String): Unit = { + val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') + lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { + case (pastAppShuffleIdentifier, (celebornShuffleId, _)) => { + if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) + || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) + || noRunningDownstreamStage(celebornShuffleId) + || !committedSuccessfully(celebornShuffleId)) { + val Array(_, stageId, attemptId) = pastAppShuffleIdentifier.split('-') + shufflesToBeCleand.put(celebornShuffleId) + } + } + } + } + + private def committedSuccessfully(celebornShuffleId: Int): Boolean = { + val ret = !lifecycleManager.get().commitManager.getCommitHandler(celebornShuffleId) + .isStageDataLost(celebornShuffleId) + if (!ret) { + logInfo(s"shuffle $celebornShuffleId is failed to commit, adding for cleaning up") + } + ret + } + + def setLifecycleManager(ref: LifecycleManager): Unit = { + lifecycleManager.compareAndSet(null, ref) + } + + private def noRunningDownstreamStage(shuffleId: Int): Boolean = { + val allReferringStageIds = celebornShuffleIdToReferringStages.get(shuffleId) + require(allReferringStageIds != null, s"no stage referring to shuffle $shuffleId") + val ret = + allReferringStageIds.count(stageId => runningStageManager.isRunningStage(stageId)) == 0 + if (ret) { + logInfo(s"no running downstream stages refers to $shuffleId") + } else { + logInfo(s"there is more than one running downstream stage referring to shuffle $shuffleId," + + s" ignore it for cleanup ") + } + ret + } + + private val cleanerThread = new Thread() { + override def run(): Unit = { + while (true) { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleand.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.get().unregisterShuffle(shuffleId) + logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId + } + } + Thread.sleep(1000) + } + } + } + + cleanerThread.setName("shuffle cleaner thread") + cleanerThread.setDaemon(true) + cleanerThread.start() +} \ No newline at end of file diff --git a/tests/spark-it/src/test/scala/org/apache/spark/SparkContextHelper.scala b/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala similarity index 99% rename from tests/spark-it/src/test/scala/org/apache/spark/SparkContextHelper.scala rename to client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala index 914cd0830c0..cc7bd44ed7f 100644 --- a/tests/spark-it/src/test/scala/org/apache/spark/SparkContextHelper.scala +++ b/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala @@ -18,6 +18,7 @@ package org.apache.spark object SparkContextHelper { + def env: SparkEnv = { assert(SparkContext.getActive.isDefined) SparkContext.getActive.get.env diff --git a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala new file mode 100644 index 00000000000..0e3e91eeb26 --- /dev/null +++ b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler + +import org.apache.spark.SparkContext + +trait RunningStageManager { + def isRunningStage(stageId: Int): Boolean +} + +class RunningStageManagerImpl extends RunningStageManager { + private def dagScheduler = SparkContext.getActive.get.dagScheduler + override def isRunningStage(stageId: Int): Boolean = { + dagScheduler.runningStages.map(_.id).contains(stageId) + } +} \ No newline at end of file diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index a3e75cd10a1..77acac9862c 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -150,7 +150,6 @@ private void initializeLifecycleManager(String appId) { lifecycleManager.registerShuffleTrackerCallback( shuffleId -> SparkUtils.unregisterAllMapOutput(mapOutputTracker, shuffleId)); - if (celebornConf.clientAdaptiveOptimizeSkewedPartitionReadEnabled()) { lifecycleManager.registerCelebornSkewShuffleCheckCallback( SparkUtils::isCelebornSkewShuffleOrChildShuffle); @@ -164,6 +163,17 @@ private void initializeLifecycleManager(String appId) { shuffleId, getReducerFileGroupResponse)); lifecycleManager.registerInvalidatedBroadcastCallback( shuffleId -> SparkUtils.invalidateSerializedGetReducerFileGroupResponse(shuffleId)); + + } + if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { + lifecycleManager.registerGetShuffleIdForWriterCallback( + (celebornShuffleId, appShuffleIdentifier) -> + SparkUtils.addWriterShuffleIdsToBeCleaned( + lifecycleManager, celebornShuffleId, appShuffleIdentifier)); + lifecycleManager.registerGetShuffleIdForReaderCallback( + (celebornShuffleId, appShuffleIdentifier) -> + SparkUtils.addShuffleIdRefCount( + lifecycleManager, celebornShuffleId, appShuffleIdentifier)); } } } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index b2e64565ec8..260df242c1b 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -29,6 +29,8 @@ import java.util.concurrent.atomic.LongAdder; import java.util.stream.Collectors; +import org.apache.celeborn.client.LifecycleManager; +import org.apache.celeborn.spark.FailedShuffleCleaner; import scala.Option; import scala.Some; import scala.Tuple2; @@ -625,4 +627,16 @@ public static void invalidateSerializedGetReducerFileGroupResponse(Integer shuff return null; }); } + + public static void addWriterShuffleIdsToBeCleaned( + LifecycleManager lifecycleManager, int celebornShuffeId, String appShuffleIdentifier) { + FailedShuffleCleaner.setLifecycleManager(lifecycleManager); + FailedShuffleCleaner.addShuffleIdToBeCleaned(celebornShuffeId, appShuffleIdentifier); + } + + public static void addShuffleIdRefCount( + LifecycleManager lifecycleManager, int celebornShuffeId, String appShuffleIdentifier) { + FailedShuffleCleaner.setLifecycleManager(lifecycleManager); + FailedShuffleCleaner.addShuffleIdReferringStage(celebornShuffeId, appShuffleIdentifier); + } } diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 20e1099d2aa..110ff1c1612 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -927,6 +927,7 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends logInfo(s"reuse existing shuffleId $id for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier") id } else { + // this branch means it is a redo of previous write stage if (isBarrierStage) { // unregister previous shuffle(s) which are still valid val mapUpdates = shuffleIds.filter(_._2._2).map { kv => @@ -937,6 +938,8 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } val newShuffleId = shuffleIdGenerator.getAndIncrement() logInfo(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier") + getShuffleIdForWriterCallback.foreach(callback => + callback.accept(newShuffleId, appShuffleIdentifier)) shuffleIds.put(appShuffleIdentifier, (newShuffleId, true)) newShuffleId } @@ -950,11 +953,13 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } else { shuffleIds.values.filter(v => v._2).map(v => v._1).toSeq.reverse.find( areAllMapTasksEnd) match { - case Some(shuffleId) => + case Some(celebornShuffleId) => + getShuffleIdForReaderCallback.foreach(callback => + callback.accept(celebornShuffleId, appShuffleIdentifier)) val pbGetShuffleIdResponse = { logDebug( - s"get shuffleId $shuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier isWriter $isWriter") - PbGetShuffleIdResponse.newBuilder().setShuffleId(shuffleId).setSuccess(true).build() + s"get shuffleId $celebornShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier isWriter $isWriter") + PbGetShuffleIdResponse.newBuilder().setShuffleId(celebornShuffleId).setSuccess(true).build() } context.reply(pbGetShuffleIdResponse) case None => @@ -1850,6 +1855,17 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends appShuffleTrackerCallback = Some(callback) } + // expecting celeborn shuffle id and application shuffle identifier + @volatile private var getShuffleIdForWriterCallback: Option[BiConsumer[Integer, String]] = None + def registerGetShuffleIdForWriterCallback(callback: BiConsumer[Integer, String]): Unit = { + getShuffleIdForWriterCallback = Some(callback) + } + // expecting celeborn shuffle id and application shuffle identifier + @volatile private var getShuffleIdForReaderCallback: Option[BiConsumer[Integer, String]] = None + def registerGetShuffleIdForReaderCallback(callback: BiConsumer[Integer, String]): Unit = { + getShuffleIdForReaderCallback = Some(callback) + } + def registerAppShuffleDeterminate(appShuffleId: Int, determinate: Boolean): Unit = { appShuffleDeterminateMap.put(appShuffleId, determinate) } @@ -1931,4 +1947,6 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends case _ => } } + + def getShuffleIdMapping = shuffleIdMapping } diff --git a/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala b/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala index 5bdd1c550c1..810418931ca 100644 --- a/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala +++ b/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala @@ -69,8 +69,8 @@ class ReducePartitionCommitHandler( class MultiSerdeVersionRpcContext(val ctx: RpcCallContext, val serdeVersion: SerdeVersion) {} private val getReducerFileGroupRequest = - JavaUtils.newConcurrentHashMap[Int, util.Set[MultiSerdeVersionRpcContext]]() - private val dataLostShuffleSet = ConcurrentHashMap.newKeySet[Int]() + JavaUtils.newConcurrentHashMap[Int, util.Set[RpcCallContext]]() + private[celeborn] val dataLostShuffleSet = ConcurrentHashMap.newKeySet[Int]() private val stageEndShuffleSet = ConcurrentHashMap.newKeySet[Int]() private val inProcessStageEndShuffleSet = ConcurrentHashMap.newKeySet[Int]() private val shuffleMapperAttempts = JavaUtils.newConcurrentHashMap[Int, Array[Int]]() diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index d33028a9ab5..556d269f023 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -997,6 +997,7 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se def clientFetchMaxRetriesForEachReplica: Int = get(CLIENT_FETCH_MAX_RETRIES_FOR_EACH_REPLICA) def clientStageRerunEnabled: Boolean = get(CLIENT_STAGE_RERUN_ENABLED) + def clientFetchCleanFailedShuffle: Boolean = get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE) def clientFetchExcludeWorkerOnFailureEnabled: Boolean = get(CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED) def clientFetchExcludedWorkerExpireTimeout: Long = @@ -4813,6 +4814,14 @@ object CelebornConf extends Logging { .booleanConf .createWithDefault(true) + val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE: ConfigEntry[Boolean] = + buildConf("celeborn.client.spark.fetch.cleanFailedShuffle") + .categories("client") + .version("0.4.1") + .doc("whether to clean those disk space occupied by shuffles which cannot be fetched") + .booleanConf + .createWithDefault(false) + val CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED: ConfigEntry[Boolean] = buildConf("celeborn.client.fetch.excludeWorkerOnFailure.enabled") .categories("client") From 1bf53bda8fc7b95c10736cafaf2253dccc1af719 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 16:38:01 -0800 Subject: [PATCH 002/120] fetch failure suite --- .../celeborn/spark/FailedShuffleCleaner.scala | 8 +- .../spark/scheduler/RunningStageManager.scala | 2 +- .../spark/shuffle/celeborn/SparkUtils.java | 4 +- .../CelebornFetchFailureDiskCleanSuite.scala | 20 +++ .../spark/CelebornFetchFailureSuite.scala | 109 +++------------ .../celeborn/tests/spark/SparkTestBase.scala | 42 ------ .../fetch_failure/FetchFailureTestBase.scala | 57 ++++++++ .../fetch_failure/ShuffleReaderGetHooks.scala | 125 ++++++++++++++++++ .../TestRunningStageManager.scala | 39 ++++++ .../scheduler/SparkSchedulerHelper.scala | 26 ++++ 10 files changed, 290 insertions(+), 142 deletions(-) create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala create mode 100644 tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index c0827c6e6d1..58570c4de06 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -18,10 +18,12 @@ package org.apache.celeborn.spark import java.util import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} import java.util.concurrent.atomic.AtomicReference + import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} + import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging @@ -82,8 +84,8 @@ private[celeborn] object FailedShuffleCleaner extends Logging { } def addShuffleIdToBeCleaned( - celebornShuffleId: Int, - appShuffleIdentifier: String): Unit = { + celebornShuffleId: Int, + appShuffleIdentifier: String): Unit = { val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { case (pastAppShuffleIdentifier, (celebornShuffleId, _)) => { @@ -145,4 +147,4 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanerThread.setName("shuffle cleaner thread") cleanerThread.setDaemon(true) cleanerThread.start() -} \ No newline at end of file +} diff --git a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala index 0e3e91eeb26..daa9688c4e9 100644 --- a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala +++ b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala @@ -27,4 +27,4 @@ class RunningStageManagerImpl extends RunningStageManager { override def isRunningStage(stageId: Int): Boolean = { dagScheduler.runningStages.map(_.id).contains(stageId) } -} \ No newline at end of file +} diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 260df242c1b..14d6de54475 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -29,8 +29,6 @@ import java.util.concurrent.atomic.LongAdder; import java.util.stream.Collectors; -import org.apache.celeborn.client.LifecycleManager; -import org.apache.celeborn.spark.FailedShuffleCleaner; import scala.Option; import scala.Some; import scala.Tuple2; @@ -66,6 +64,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.celeborn.client.LifecycleManager; import org.apache.celeborn.client.ShuffleClient; import org.apache.celeborn.common.CelebornConf; import org.apache.celeborn.common.exception.CelebornRuntimeException; @@ -77,6 +76,7 @@ import org.apache.celeborn.reflect.DynConstructors; import org.apache.celeborn.reflect.DynFields; import org.apache.celeborn.reflect.DynMethods; +import org.apache.celeborn.spark.FailedShuffleCleaner; public class SparkUtils { private static final Logger LOG = LoggerFactory.getLogger(SparkUtils.class); diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala new file mode 100644 index 00000000000..0bb3b785314 --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.celeborn.tests.spark + +class CelebornFetchFailureDiskCleanSuite {} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala index dd0f3840149..a755b1581f1 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala @@ -30,9 +30,10 @@ import org.scalatest.funsuite.AnyFunSuite import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.common.protocol.ShuffleMode +import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureTestBase, FileDeletionShuffleReaderGetHook} class CelebornFetchFailureSuite extends AnyFunSuite - with SparkTestBase + with FetchFailureTestBase with BeforeAndAfterEach { override def beforeEach(): Unit = { @@ -45,19 +46,10 @@ class CelebornFetchFailureSuite extends AnyFunSuite test("celeborn spark integration test - Fetch Failure") { if (Spark3OrNewer) { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession() val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new ShuffleReaderFetchFailureGetHook(celebornConf) + val hook = new FileDeletionShuffleReaderGetHook(celebornConf, workerDirs) TestCelebornShuffleManager.registerReaderGetHook(hook) val value = Range(1, 10000).mkString(",") @@ -86,13 +78,7 @@ class CelebornFetchFailureSuite extends AnyFunSuite test("celeborn spark integration test - unregister shuffle with throwsFetchFailure disabled") { if (Spark3OrNewer) { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "false") - .getOrCreate() + val sparkSession = createSparkSession() val value = Range(1, 10000).mkString(",") val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) @@ -118,19 +104,10 @@ class CelebornFetchFailureSuite extends AnyFunSuite test("celeborn spark integration test - Fetch Failure with multiple shuffle data") { if (Spark3OrNewer) { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession() val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new ShuffleReaderFetchFailureGetHook(celebornConf) + val hook = new FileDeletionShuffleReaderGetHook(celebornConf, workerDirs) TestCelebornShuffleManager.registerReaderGetHook(hook) import sparkSession.implicits._ @@ -149,19 +126,10 @@ class CelebornFetchFailureSuite extends AnyFunSuite test("celeborn spark integration test - Fetch Failure with RDD reuse") { if (Spark3OrNewer) { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession() val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new ShuffleReaderFetchFailureGetHook(celebornConf) + val hook = new FileDeletionShuffleReaderGetHook(celebornConf, workerDirs) TestCelebornShuffleManager.registerReaderGetHook(hook) val sc = sparkSession.sparkContext @@ -189,19 +157,10 @@ class CelebornFetchFailureSuite extends AnyFunSuite test("celeborn spark integration test - Fetch Failure with read write shuffles in one stage") { if (Spark3OrNewer) { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession() val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new ShuffleReaderFetchFailureGetHook(celebornConf) + val hook = new FileDeletionShuffleReaderGetHook(celebornConf, workerDirs) TestCelebornShuffleManager.registerReaderGetHook(hook) val sc = sparkSession.sparkContext @@ -220,13 +179,7 @@ class CelebornFetchFailureSuite extends AnyFunSuite test("celeborn spark integration test - empty shuffle data") { if (Spark3OrNewer) { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .getOrCreate() + val sparkSession = createSparkSession(overrideShuffleMgr = false) sparkSession.sql("create table if not exists t_1 (a bigint) using parquet") sparkSession.sql("create table if not exists t_2 (a bigint) using parquet") @@ -241,17 +194,7 @@ class CelebornFetchFailureSuite extends AnyFunSuite } test(s"celeborn spark integration test - resubmit an unordered barrier stage with throwsFetchFailure enabled") { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config("spark.celeborn.client.push.buffer.max.size", 0) - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession(overrideShuffleMgr = false) try { val sc = sparkSession.sparkContext @@ -285,17 +228,7 @@ class CelebornFetchFailureSuite extends AnyFunSuite } test(s"celeborn spark integration test - fetch failure in child of an unordered barrier stage with throwsFetchFailure enabled") { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config("spark.celeborn.client.push.buffer.max.size", 0) - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession(overrideShuffleMgr = false) try { val sc = sparkSession.sparkContext @@ -331,19 +264,7 @@ class CelebornFetchFailureSuite extends AnyFunSuite } test(s"celeborn spark integration test - resubmit a failed barrier stage across jobs") { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.stageRerun.enabled", "true") - .config("spark.celeborn.client.push.buffer.max.size", 0) - .config("spark.stage.maxConsecutiveAttempts", "1") - .config("spark.stage.maxAttempts", "1") - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + val sparkSession = createSparkSession(overrideShuffleMgr = false) // trigger failure CelebornFetchFailureSuite.triggerFailure.set(true) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/SparkTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/SparkTestBase.scala index e29b21a0c71..41cbe072b32 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/SparkTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/SparkTestBase.scala @@ -116,46 +116,4 @@ trait SparkTestBase extends AnyFunSuite val outMap = result.collect().map(row => row.getString(0) -> row.getLong(1)).toMap outMap } - - class ShuffleReaderFetchFailureGetHook(conf: CelebornConf) extends ShuffleManagerHook { - var executed: AtomicBoolean = new AtomicBoolean(false) - val lock = new Object - - override def exec( - handle: ShuffleHandle, - startPartition: Int, - endPartition: Int, - context: TaskContext): Unit = { - if (executed.get() == true) return - - lock.synchronized { - handle match { - case h: CelebornShuffleHandle[_, _, _] => { - val appUniqueId = h.appUniqueId - val shuffleClient = ShuffleClient.get( - h.appUniqueId, - h.lifecycleManagerHost, - h.lifecycleManagerPort, - conf, - h.userIdentifier, - h.extension) - val celebornShuffleId = - SparkUtils.celebornShuffleId(shuffleClient, h, context, false) - val allFiles = workerDirs.map(dir => { - new File(s"$dir/celeborn-worker/shuffle_data/$appUniqueId/$celebornShuffleId") - }) - val datafile = allFiles.filter(_.exists()) - .flatMap(_.listFiles().iterator).sortBy(_.getName).headOption - datafile match { - case Some(file) => file.delete() - case None => throw new RuntimeException("unexpected, there must be some data file" + - s" under ${workerDirs.mkString(",")}") - } - } - case _ => throw new RuntimeException("unexpected, only support RssShuffleHandle here") - } - executed.set(true) - } - } - } } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala new file mode 100644 index 00000000000..c6363cc7411 --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.celeborn.tests.spark.fetch_failure + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +import org.apache.celeborn.common.protocol.ShuffleMode +import org.apache.celeborn.tests.spark.SparkTestBase + +private[tests] trait FetchFailureTestBase extends SparkTestBase { + + def createSparkSession( + overrideShuffleMgr: Boolean = true, + enableFailedShuffleCleaner: Boolean = false): SparkSession = { + val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") + new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") + var baseBuilder = SparkSession.builder() + .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) + .config("spark.sql.shuffle.partitions", 2) + .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) + .config("spark.celeborn.shuffle.enabled", "true") + .config("spark.celeborn.client.shuffle.expired.checkInterval", "1s") + .config("spark.kryoserializer.buffer.max", "2047m") + .config("spark.celeborn.client.spark.fetch.throwsFetchFailure", "true") + baseBuilder = + if (overrideShuffleMgr) { + baseBuilder.config( + "spark.shuffle.manager", + "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") + } else { + baseBuilder + } + baseBuilder = + if (enableFailedShuffleCleaner) { + baseBuilder.config("spark.celeborn.client.spark.fetch.cleanFailedShuffle", "true") + } else { + baseBuilder + } + baseBuilder.getOrCreate() + } +} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala new file mode 100644 index 00000000000..cf22e1bcaf2 --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.celeborn.tests.spark.fetch_failure + +import java.io.File +import java.util.concurrent.atomic.AtomicBoolean + +import org.apache.spark.{SparkEnv, TaskContext} +import org.apache.spark.shuffle.ShuffleHandle +import org.apache.spark.shuffle.celeborn.{CelebornShuffleHandle, ShuffleManagerHook, SparkShuffleManager, SparkUtils, TestCelebornShuffleManager} + +import org.apache.celeborn.client.{LifecycleManager, ShuffleClient} +import org.apache.celeborn.client.commit.ReducePartitionCommitHandler +import org.apache.celeborn.common.CelebornConf + +class FailCommitShuffleReaderGetHook( + conf: CelebornConf) + extends ShuffleManagerHook { + var executed: AtomicBoolean = new AtomicBoolean(false) + val lock = new Object + override def exec( + handle: ShuffleHandle, + startPartition: Int, + endPartition: Int, + context: TaskContext): Unit = { + if (executed.get()) return + lock.synchronized { + // this has to be used in local mode since it leverages that the lifecycle manager + // is in the same process with reader + handle match { + case h: CelebornShuffleHandle[_, _, _] => { + val shuffleClient = ShuffleClient.get( + h.appUniqueId, + h.lifecycleManagerHost, + h.lifecycleManagerPort, + conf, + h.userIdentifier, + h.extension) + val celebornShuffleId = SparkUtils.celebornShuffleId(shuffleClient, h, context, false) + val lifecycleManager = + SparkEnv.get.shuffleManager.asInstanceOf[TestCelebornShuffleManager] + .getLifecycleManager + val commitHandler = lifecycleManager.commitManager.getCommitHandler(celebornShuffleId) + commitHandler.asInstanceOf[ReducePartitionCommitHandler].dataLostShuffleSet.add( + celebornShuffleId) + executed.set(true) + } + case _ => throw new RuntimeException("unexpected, only support RssShuffleHandle here") + } + } + } +} + +class FileDeletionShuffleReaderGetHook( + conf: CelebornConf, + workerDirs: Seq[String], + shuffleIdToBeDeleted: Seq[Int] = Seq(), + triggerStageId: Option[Int] = None) + extends ShuffleManagerHook { + var executed: AtomicBoolean = new AtomicBoolean(false) + val lock = new Object + private def deleteDataFile(appUniqueId: String, celebornShuffleId: Int): Unit = { + val datafile = + workerDirs.map(dir => { + new File(s"$dir/celeborn-worker/shuffle_data/$appUniqueId/$celebornShuffleId") + }).filter(_.exists()) + .flatMap(_.listFiles().iterator).headOption + datafile match { + case Some(file) => { + file.delete() + } + case None => throw new RuntimeException("unexpected, there must be some data file") + } + } + override def exec( + handle: ShuffleHandle, + startPartition: Int, + endPartition: Int, + context: TaskContext): Unit = { + if (executed.get()) return + lock.synchronized { + handle match { + case h: CelebornShuffleHandle[_, _, _] => { + val appUniqueId = h.appUniqueId + val shuffleClient = ShuffleClient.get( + h.appUniqueId, + h.lifecycleManagerHost, + h.lifecycleManagerPort, + conf, + h.userIdentifier, + h.extension) + val celebornShuffleId = SparkUtils.celebornShuffleId(shuffleClient, h, context, false) + val appShuffleIdentifier = SparkUtils.getAppShuffleIdentifier(handle.shuffleId, context) + val Array(_, stageId, _) = appShuffleIdentifier.split('-') + if (triggerStageId.isEmpty || triggerStageId.get == stageId.toInt) { + if (shuffleIdToBeDeleted.isEmpty) { + deleteDataFile(appUniqueId, celebornShuffleId) + } else { + shuffleIdToBeDeleted.foreach { shuffleId => + deleteDataFile(appUniqueId, shuffleId) + } + } + executed.set(true) + } + } + case _ => throw new RuntimeException("unexpected, only support RssShuffleHandle here") + } + } + } +} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala new file mode 100644 index 00000000000..6a75398a24b --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.celeborn.tests.spark.fetch_failure + +import scala.collection.mutable + +import org.apache.spark.scheduler.{RunningStageManager, SparkSchedulerHelper} + +class TestRunningStageManager extends RunningStageManager { + import TestRunningStageManager._ + def setRunningStages(stageIds: Seq[Int]): Unit = { + stageIds.foreach(stageId => runningStages += stageId) + } + override def isRunningStage(stageId: Int): Boolean = { + if (runningStages.contains(stageId)) { + println(s"instrumented running stages contains $stageId") + true + } else { + SparkSchedulerHelper.runningStages.map(_.id).contains(stageId) + } + } +} +object TestRunningStageManager { + val runningStages = new mutable.HashSet[Int] +} diff --git a/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala b/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala new file mode 100644 index 00000000000..0eafaee935d --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.scheduler + +import org.apache.spark.SparkContext + +object SparkSchedulerHelper { + def dagScheduler = SparkContext.getActive.get.dagScheduler + + def runningStages = dagScheduler.runningStages +} From 4f6acca830c3c6414eab1c2ea53664426704fa36 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 16:41:26 -0800 Subject: [PATCH 003/120] add disk clean suite --- .../CelebornFetchFailureDiskCleanSuite.scala | 350 +++++++++++++++++- 1 file changed, 348 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 0bb3b785314..e149316f181 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -14,7 +14,353 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.celeborn.tests.spark -class CelebornFetchFailureDiskCleanSuite {} +import java.io.File + +import scala.collection.mutable + +import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} +import org.apache.spark.sql.SparkSession +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.celeborn.client.ShuffleClient +import org.apache.celeborn.service.deploy.worker.Worker +import org.apache.celeborn.spark.FailedShuffleCleaner +import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureTestBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} + +class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite + with FetchFailureTestBase + with BeforeAndAfterEach { + + override def beforeEach(): Unit = { + ShuffleClient.reset() + FailedShuffleCleaner.reset() + } + + override def afterEach(): Unit = { + System.gc() + } + + override def createWorker(map: Map[String, String]): Worker = { + val storageDir = createTmpDir() + workerDirs = workerDirs :+ storageDir + super.createWorker(map ++ Map("celeborn.master.heartbeat.worker.timeout" -> "10s"), storageDir) + } + + class CheckingThread( + shuffleIdShouldNotExist: Seq[Int], + shuffleIdMustExist: Seq[Int], + sparkSession: SparkSession) + extends Thread { + var exception: Exception = _ + protected def checkDirStatus(): Boolean = { + val deletedSuccessfully = shuffleIdShouldNotExist.forall(shuffleId => { + workerDirs.forall(dir => + !new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) + }) + val createdSuccessfully = shuffleIdMustExist.forall(shuffleId => { + workerDirs.exists(dir => + new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) + }) + deletedSuccessfully && createdSuccessfully + } + override def run(): Unit = { + var allDataInShape = checkDirStatus() + while (!allDataInShape) { + Thread.sleep(1000) + allDataInShape = checkDirStatus() + } + } + } + class CheckingThreadForStableStatus( + shuffleIdShouldNotExist: Seq[Int], + shuffleIdMustExist: Seq[Int], + sparkSession: SparkSession) + extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { + override def run(): Unit = { + val timeout = 60000 + var elapseTime = 0L + var allDataInShape = checkDirStatus() + while (!allDataInShape) { + Thread.sleep(5000) + println("init state not meet") + allDataInShape = checkDirStatus() + } + while (allDataInShape) { + Thread.sleep(5000) + elapseTime += 5000 + if (elapseTime > timeout) { + return + } + allDataInShape = checkDirStatus() + if (!allDataInShape) { + exception = new IllegalStateException("the directory state does not meet" + + " the expected state") + throw exception + } + } + } + } + private def triggerStorageCheckThread( + shuffleIdShouldNotExist: Seq[Int], + shuffleIdMustExist: Seq[Int], + sparkSession: SparkSession, + forStableStatusChecking: Boolean): CheckingThread = { + val checkingThread = + if (!forStableStatusChecking) { + new CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) + } else { + new CheckingThreadForStableStatus(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) + } + checkingThread.setDaemon(true) + checkingThread.start() + checkingThread + } + private def checkStorageValidation(checkingThread: Thread): Unit = { + checkingThread.join(120 * 1000) + if (checkingThread.isAlive || checkingThread.asInstanceOf[CheckingThread].exception != null) { + throw new IllegalStateException("the storage checking status failed," + + s"${}") + } + } + // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application + // finish + test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { + if (runningWithSpark3OrNewer()) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val value = Range(1, 10000).mkString(",") + val checkingThread = + triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) + val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) + .map { i => (i, value) }.groupByKey(16).collect() + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + assert(tuples.length == 10000) + for (elem <- tuples) { + assert(elem._2.mkString(",").equals(value)) + } + sparkSession.stop() + } + } + // 2. for multiple level 1-1 lineage, the old disk space is cleaned one by one + test("celeborn spark integration test - (1-1 dep with, multi-level lineage) the failed shuffle file is cleaned up correctly") { + if (runningWithSpark3OrNewer()) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = + new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1), + triggerStageId = Some(2)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val value = Range(1, 10000).mkString(",") + val checkingThread = triggerStorageCheckThread( + Seq(0, 1), + Seq(2, 3, 4), + sparkSession, + forStableStatusChecking = false) + val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) + .map { i => (i, value) }.groupByKey(16).map { + case (k, elements) => + (k, elements.map(str => str.toLowerCase)) + }.groupByKey(4).groupByKey(2).collect() + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + assert(tuples.length == 10000) + for (elem <- tuples) { + assert(elem._2.flatten.flatten.mkString(",").equals(value)) + } + sparkSession.stop() + } + } + // 3. for single level M-1 lineage, the single failed disk space is cleaned + test( + "celeborn spark integration test - (M-1 dep with single level lineage) the single failed shuffle file is cleaned up correctly") { + if (runningWithSpark3OrNewer()) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = + triggerStorageCheckThread(Seq(0), Seq(1, 2), sparkSession, forStableStatusChecking = false) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + val df2 = Seq((2, "c"), (2, "d")).toDF("id", "data").groupBy("id").count() + val tuples = df1.hint("merge").join(df2, "id").select("*").collect() + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[2,1,2]" + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + println(s"end ${System.currentTimeMillis()}") + } + } + // 4. for single level M-1 lineage, all failed disk spaces are cleaned + test("celeborn spark integration test - (M-1 dep with single-level lineage) all failed disk spaces are cleaned") { + if (runningWithSpark3OrNewer()) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = triggerStorageCheckThread( + Seq(0, 1), + Seq(2, 3), + sparkSession, + forStableStatusChecking = false) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() + val tuples = df1.hint("merge").join(df2, "id").select("*").collect() + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[2,1,1]" + println(tuples.head.toString()) + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + } + } + // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned + test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + + " correctly") { + if (runningWithSpark3OrNewer()) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1, 2, 3), + triggerStageId = Some(4)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = triggerStorageCheckThread( + Seq(0, 1, 2, 3), + Seq(4, 5, 6, 7), + sparkSession, + forStableStatusChecking = false) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df1_count") + val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df2_count") + val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[1,2,2]" + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + } + } + + // 7. if the dependency is 1 to M , we should not clean it + test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { + if (runningWithSpark3OrNewer()) { + System.setProperty( + FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, + "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") + FailedShuffleCleaner.reset() + // create dummy running stages + TestRunningStageManager.runningStages += 2 + FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) + FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = + triggerStorageCheckThread(Seq(), Seq(0, 1), sparkSession, forStableStatusChecking = true) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + val tuple = df1.collect().map(r => r.getAs[Int]("id")) + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[2,1]" + assert(tuple.mkString("[", ",", "]").equals(expect)) + sparkSession.stop() + } + } + // 8. if the dependency is 1 to M but failed in commit phase, we should just clean it + test("celeborn spark integration test - clear the failed-to-commit shuffle file even it is referred by more than once") { + if (runningWithSpark3OrNewer()) { + System.setProperty( + FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, + "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") + FailedShuffleCleaner.reset() + // create dummy running stages + TestRunningStageManager.runningStages += 2 + FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) + FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FailCommitShuffleReaderGetHook(celebornConf) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = + triggerStorageCheckThread(Seq(0, 2), Seq(1), sparkSession, forStableStatusChecking = true) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + val tuples = df1.collect().map(r => r.getAs[Int]("id")) + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[2,1]" + assert(tuples.mkString("[", ",", "]").equals(expect)) + sparkSession.stop() + } + } + test("celeborn spark integration test - clean up the shuffle files if" + + " the referring stage has finished") { + if (runningWithSpark3OrNewer()) { + System.setProperty( + FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, + "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") + FailedShuffleCleaner.reset() + // create dummy running stages + FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) + FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = + triggerStorageCheckThread(Seq(), Seq(1), sparkSession, forStableStatusChecking = true) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + val tuple = df1.collect().map(r => r.getAs[Int]("id")) + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[2,1]" + assert(tuple.mkString("[", ",", "]").equals(expect)) + sparkSession.stop() + } + } +} From 935f11b53058f0a79cf0038e06801ba467b416ae Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 16:51:03 -0800 Subject: [PATCH 004/120] fix compilation error --- .../celeborn/spark/FailedShuffleCleaner.scala | 2 +- .../CelebornFetchFailureDiskCleanSuite.scala | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 58570c4de06..4596e91dbc2 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -68,7 +68,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { val stageIds = celebornShuffleIdToReferringStages.computeIfAbsent( celebornShuffleId, - (_) => new mutable.HashSet[Int]()) + (_: Int) => new mutable.HashSet[Int]()) lock.synchronized { stageIds.add(stageId.toInt) } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index e149316f181..49404cd1926 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -130,7 +130,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FileDeletionShuffleReaderGetHook( @@ -155,7 +155,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } // 2. for multiple level 1-1 lineage, the old disk space is cleaned one by one test("celeborn spark integration test - (1-1 dep with, multi-level lineage) the failed shuffle file is cleaned up correctly") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = @@ -189,7 +189,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite // 3. for single level M-1 lineage, the single failed disk space is cleaned test( "celeborn spark integration test - (M-1 dep with single level lineage) the single failed shuffle file is cleaned up correctly") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FileDeletionShuffleReaderGetHook( @@ -214,7 +214,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } // 4. for single level M-1 lineage, all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with single-level lineage) all failed disk spaces are cleaned") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FileDeletionShuffleReaderGetHook( @@ -243,7 +243,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + " correctly") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FileDeletionShuffleReaderGetHook( @@ -276,7 +276,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { System.setProperty( FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") @@ -307,7 +307,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } // 8. if the dependency is 1 to M but failed in commit phase, we should just clean it test("celeborn spark integration test - clear the failed-to-commit shuffle file even it is referred by more than once") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { System.setProperty( FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") @@ -335,7 +335,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } test("celeborn spark integration test - clean up the shuffle files if" + " the referring stage has finished") { - if (runningWithSpark3OrNewer()) { + if (Spark3OrNewer) { System.setProperty( FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") From 36af3f394e0d302cc65f0d7b2c9b5d5219cda5ab Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 16:59:52 -0800 Subject: [PATCH 005/120] continue fixing compilation error --- .../org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala b/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala index 83a5e12f60b..8ded642eb7b 100644 --- a/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala @@ -33,6 +33,7 @@ import org.apache.celeborn.common.protocol.{PartitionLocation, ShuffleMode} import org.apache.celeborn.common.protocol.message.ControlMessages.GetReducerFileGroupResponse import org.apache.celeborn.common.protocol.message.StatusCode import org.apache.celeborn.tests.spark.SparkTestBase +import org.apache.celeborn.tests.spark.fetch_failure.FileDeletionShuffleReaderGetHook class SparkUtilsSuite extends AnyFunSuite with SparkTestBase @@ -60,7 +61,7 @@ class SparkUtilsSuite extends AnyFunSuite .getOrCreate() val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new ShuffleReaderFetchFailureGetHook(celebornConf) + val hook = new FileDeletionShuffleReaderGetHook(celebornConf) TestCelebornShuffleManager.registerReaderGetHook(hook) try { From 3599e47e70a1c9032db6e9bdfe470063ecd05650 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 17:04:51 -0800 Subject: [PATCH 006/120] fix compilation error --- .../org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala b/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala index 8ded642eb7b..618b204b753 100644 --- a/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/spark/shuffle/celeborn/SparkUtilsSuite.scala @@ -61,7 +61,7 @@ class SparkUtilsSuite extends AnyFunSuite .getOrCreate() val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook(celebornConf) + val hook = new FileDeletionShuffleReaderGetHook(celebornConf, workerDirs) TestCelebornShuffleManager.registerReaderGetHook(hook) try { From 23a76fac69d1aa4838508e417c14424c4e1ff484 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 17:12:34 -0800 Subject: [PATCH 007/120] param doc --- docs/configuration/client.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/configuration/client.md b/docs/configuration/client.md index 6c0ff752d3e..45fefb67bf1 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -111,6 +111,7 @@ license: | | celeborn.client.shuffle.register.filterExcludedWorker.enabled | false | false | Whether to filter excluded worker when register shuffle. | 0.4.0 | | | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 | | | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 | | +| celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.4.1 | | | celeborn.client.spark.push.dynamicWriteMode.enabled | false | false | Whether to dynamically switch push write mode based on conditions.If true, shuffle mode will be only determined by partition count | 0.5.0 | | | celeborn.client.spark.push.dynamicWriteMode.partitionNum.threshold | 2000 | false | Threshold of shuffle partition number for dynamically switching push writer mode. When the shuffle partition number is greater than this value, use the sort-based shuffle writer for memory efficiency; otherwise use the hash-based shuffle writer for speed. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is true. | 0.5.0 | | | celeborn.client.spark.push.sort.memory.maxMemoryFactor | 0.4 | false | the max portion of executor memory which can be used for SortBasedWriter buffer (only valid when celeborn.client.spark.push.sort.memory.useAdaptiveThreshold is enabled | 0.5.0 | | From 8932e4b2a6ebb867d30ecf052a42377c478a4a7b Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 17:14:07 -0800 Subject: [PATCH 008/120] change param ver --- .../main/scala/org/apache/celeborn/common/CelebornConf.scala | 2 +- docs/configuration/client.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 556d269f023..5a0e3cc764f 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -4817,7 +4817,7 @@ object CelebornConf extends Logging { val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE: ConfigEntry[Boolean] = buildConf("celeborn.client.spark.fetch.cleanFailedShuffle") .categories("client") - .version("0.4.1") + .version("0.6.0") .doc("whether to clean those disk space occupied by shuffles which cannot be fetched") .booleanConf .createWithDefault(false) diff --git a/docs/configuration/client.md b/docs/configuration/client.md index 45fefb67bf1..8e8f6c03a2c 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -111,7 +111,7 @@ license: | | celeborn.client.shuffle.register.filterExcludedWorker.enabled | false | false | Whether to filter excluded worker when register shuffle. | 0.4.0 | | | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 | | | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 | | -| celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.4.1 | | +| celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.6.0 | | | celeborn.client.spark.push.dynamicWriteMode.enabled | false | false | Whether to dynamically switch push write mode based on conditions.If true, shuffle mode will be only determined by partition count | 0.5.0 | | | celeborn.client.spark.push.dynamicWriteMode.partitionNum.threshold | 2000 | false | Threshold of shuffle partition number for dynamically switching push writer mode. When the shuffle partition number is greater than this value, use the sort-based shuffle writer for memory efficiency; otherwise use the hash-based shuffle writer for speed. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is true. | 0.5.0 | | | celeborn.client.spark.push.sort.memory.maxMemoryFactor | 0.4 | false | the max portion of executor memory which can be used for SortBasedWriter buffer (only valid when celeborn.client.spark.push.sort.memory.useAdaptiveThreshold is enabled | 0.5.0 | | From 4db2d88581cd3c79dc48eaff6f1130355e003da3 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 18:14:38 -0800 Subject: [PATCH 009/120] add debug info --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 2 ++ .../tests/spark/fetch_failure/FetchFailureTestBase.scala | 3 ++- .../tests/spark/fetch_failure/ShuffleReaderGetHooks.scala | 8 +++++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 49404cd1926..0d08882ad6b 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -76,6 +76,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } } } + class CheckingThreadForStableStatus( shuffleIdShouldNotExist: Seq[Int], shuffleIdMustExist: Seq[Int], @@ -105,6 +106,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } } } + private def triggerStorageCheckThread( shuffleIdShouldNotExist: Seq[Int], shuffleIdMustExist: Seq[Int], diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index c6363cc7411..42398cce495 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -29,7 +29,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { overrideShuffleMgr: Boolean = true, enableFailedShuffleCleaner: Boolean = false): SparkSession = { val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") + var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) @@ -38,6 +38,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { .config("spark.celeborn.client.shuffle.expired.checkInterval", "1s") .config("spark.kryoserializer.buffer.max", "2047m") .config("spark.celeborn.client.spark.fetch.throwsFetchFailure", "true") + baseBuilder = if (overrideShuffleMgr) { baseBuilder.config( diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index cf22e1bcaf2..34884d2a1c6 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -31,8 +31,10 @@ import org.apache.celeborn.common.CelebornConf class FailCommitShuffleReaderGetHook( conf: CelebornConf) extends ShuffleManagerHook { + var executed: AtomicBoolean = new AtomicBoolean(false) val lock = new Object + override def exec( handle: ShuffleHandle, startPartition: Int, @@ -72,8 +74,10 @@ class FileDeletionShuffleReaderGetHook( shuffleIdToBeDeleted: Seq[Int] = Seq(), triggerStageId: Option[Int] = None) extends ShuffleManagerHook { + var executed: AtomicBoolean = new AtomicBoolean(false) val lock = new Object + private def deleteDataFile(appUniqueId: String, celebornShuffleId: Int): Unit = { val datafile = workerDirs.map(dir => { @@ -87,6 +91,7 @@ class FileDeletionShuffleReaderGetHook( case None => throw new RuntimeException("unexpected, there must be some data file") } } + override def exec( handle: ShuffleHandle, startPartition: Int, @@ -118,7 +123,8 @@ class FileDeletionShuffleReaderGetHook( executed.set(true) } } - case _ => throw new RuntimeException("unexpected, only support RssShuffleHandle here") + case x => throw new RuntimeException(s"unexpected, only support RssShuffleHandle here," + + s" but get ${x.getClass.getCanonicalName}") } } } From 422784e84691250500aa5bb2aeb1b6a50fc45fb5 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 18 Feb 2025 21:14:47 -0800 Subject: [PATCH 010/120] lint --- .../tests/spark/fetch_failure/ShuffleReaderGetHooks.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index 34884d2a1c6..d60d6487f67 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -124,7 +124,7 @@ class FileDeletionShuffleReaderGetHook( } } case x => throw new RuntimeException(s"unexpected, only support RssShuffleHandle here," + - s" but get ${x.getClass.getCanonicalName}") + s" but get ${x.getClass.getCanonicalName}") } } } From 29d209471f2eee2991b4a332d8f75ae42229d517 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 19 Feb 2025 08:38:23 -0800 Subject: [PATCH 011/120] try less number of workers --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 0d08882ad6b..231ac50f84f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -34,6 +34,11 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite with FetchFailureTestBase with BeforeAndAfterEach { + override def beforeAll(): Unit = { + logInfo("test initialized , setup Celeborn mini cluster") + setupMiniClusterWithRandomPorts(workerNum = 2) + } + override def beforeEach(): Unit = { ShuffleClient.reset() FailedShuffleCleaner.reset() @@ -242,6 +247,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession.stop() } } + // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + " correctly") { From 136df5edf03db21d5731bffd0c041a247f118a8b Mon Sep 17 00:00:00 2001 From: CodingCat Date: Thu, 20 Feb 2025 16:17:17 -0800 Subject: [PATCH 012/120] ignore fetchfailure test for now to see whether it is concurrency issue --- .../celeborn/tests/spark/CelebornFetchFailureSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala index a755b1581f1..55988279c23 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala @@ -25,13 +25,13 @@ import org.apache.spark.celeborn.ExceptionMakerHelper import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.celeborn.{SparkShuffleManager, SparkUtils, TestCelebornShuffleManager} import org.apache.spark.sql.SparkSession -import org.scalatest.BeforeAndAfterEach +import org.scalatest.{BeforeAndAfterEach, Ignore} import org.scalatest.funsuite.AnyFunSuite - import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.common.protocol.ShuffleMode import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureTestBase, FileDeletionShuffleReaderGetHook} +@Ignore class CelebornFetchFailureSuite extends AnyFunSuite with FetchFailureTestBase with BeforeAndAfterEach { From 7c4ff3292c271a1e490fd6b4d803ea9375b301e1 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Thu, 20 Feb 2025 16:20:54 -0800 Subject: [PATCH 013/120] lint --- .../apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala index 55988279c23..4dba6a5691e 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala @@ -27,6 +27,7 @@ import org.apache.spark.shuffle.celeborn.{SparkShuffleManager, SparkUtils, TestC import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterEach, Ignore} import org.scalatest.funsuite.AnyFunSuite + import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.common.protocol.ShuffleMode import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureTestBase, FileDeletionShuffleReaderGetHook} From 8714dcee4da48c14cd4584b0ebfac4a0f0d5800a Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 21 Feb 2025 08:47:56 -0800 Subject: [PATCH 014/120] try 1 worker --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 231ac50f84f..702d6a60014 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -36,7 +36,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite override def beforeAll(): Unit = { logInfo("test initialized , setup Celeborn mini cluster") - setupMiniClusterWithRandomPorts(workerNum = 2) + setupMiniClusterWithRandomPorts(workerNum = 1) } override def beforeEach(): Unit = { @@ -193,6 +193,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession.stop() } } + // 3. for single level M-1 lineage, the single failed disk space is cleaned test( "celeborn spark integration test - (M-1 dep with single level lineage) the single failed shuffle file is cleaned up correctly") { @@ -216,9 +217,9 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite val expect = "[2,1,2]" assert(tuples.head.toString().equals(expect)) sparkSession.stop() - println(s"end ${System.currentTimeMillis()}") } } + // 4. for single level M-1 lineage, all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with single-level lineage) all failed disk spaces are cleaned") { if (Spark3OrNewer) { From ee6dcaac51d7af0f38753e0fa9b372df396e30ca Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 21 Feb 2025 15:20:02 -0800 Subject: [PATCH 015/120] resume celeborn fetch failure suite --- .../apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala index 4dba6a5691e..1d174d0b25d 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureSuite.scala @@ -32,7 +32,6 @@ import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.common.protocol.ShuffleMode import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureTestBase, FileDeletionShuffleReaderGetHook} -@Ignore class CelebornFetchFailureSuite extends AnyFunSuite with FetchFailureTestBase with BeforeAndAfterEach { From 446bd71bb16997b5235ae88b6d9119c4209a9d69 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 21 Feb 2025 16:59:04 -0800 Subject: [PATCH 016/120] make it only available for spark 3 --- .../java/org/apache/spark/shuffle/celeborn/SparkUtils.java | 2 +- .../main/scala/org/apache/spark/SparkContextHelper.scala | 0 .../spark}/celeborn/spark/FailedShuffleCleaner.scala | 7 ++++--- .../org/apache/spark/scheduler/RunningStageManager.scala | 0 .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 2 +- 5 files changed, 6 insertions(+), 5 deletions(-) rename client-spark/{common => spark-3}/src/main/scala/org/apache/spark/SparkContextHelper.scala (100%) rename client-spark/{common/src/main/scala/org/apache => spark-3/src/main/scala/org/apache/spark}/celeborn/spark/FailedShuffleCleaner.scala (97%) rename client-spark/{common => spark-3}/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala (100%) diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 14d6de54475..23aa2f11a1a 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -45,6 +45,7 @@ import org.apache.spark.broadcast.Broadcast; import org.apache.spark.io.CompressionCodec; import org.apache.spark.io.CompressionCodec$; +import org.apache.spark.celeborn.spark.FailedShuffleCleaner; import org.apache.spark.scheduler.DAGScheduler; import org.apache.spark.scheduler.MapStatus; import org.apache.spark.scheduler.MapStatus$; @@ -76,7 +77,6 @@ import org.apache.celeborn.reflect.DynConstructors; import org.apache.celeborn.reflect.DynFields; import org.apache.celeborn.reflect.DynMethods; -import org.apache.celeborn.spark.FailedShuffleCleaner; public class SparkUtils { private static final Logger LOG = LoggerFactory.getLogger(SparkUtils.class); diff --git a/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala similarity index 100% rename from client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala rename to client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala similarity index 97% rename from client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala rename to client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala index 4596e91dbc2..b1574d75c57 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala @@ -14,7 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.celeborn.spark +package org.apache.spark.celeborn.spark + import java.util import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} import java.util.concurrent.atomic.AtomicReference @@ -27,14 +28,14 @@ import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging -private[celeborn] object FailedShuffleCleaner extends Logging { +object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) // in celeborn ids private val shufflesToBeCleand = new LinkedBlockingQueue[Int]() private val cleanedShuffleIds = new mutable.HashSet[Int] // celeborn shuffle id to stage id referred to it - private[celeborn] val celebornShuffleIdToReferringStages = + val celebornShuffleIdToReferringStages = new ConcurrentHashMap[Int, mutable.HashSet[Int]]() private val lock = new Object diff --git a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala similarity index 100% rename from client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala rename to client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 702d6a60014..cf2fec05cfa 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -20,6 +20,7 @@ import java.io.File import scala.collection.mutable +import org.apache.spark.celeborn.spark.FailedShuffleCleaner import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} import org.apache.spark.sql.SparkSession import org.scalatest.BeforeAndAfterEach @@ -27,7 +28,6 @@ import org.scalatest.funsuite.AnyFunSuite import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.service.deploy.worker.Worker -import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureTestBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite From f30b5a0348c06f5d331d0b33ac9585c7415ae79a Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 21 Feb 2025 18:11:28 -0800 Subject: [PATCH 017/120] Revert "make it only available for spark 3" This reverts commit 9bd20e775bf8b5519f20a3ab2a799d03b04e0f25. --- .../celeborn/spark/FailedShuffleCleaner.scala | 150 ++++++++++++++++++ .../org/apache/spark/SparkContextHelper.scala | 26 +++ .../spark/scheduler/RunningStageManager.scala | 30 ++++ .../spark/shuffle/celeborn/SparkUtils.java | 1 + .../celeborn/spark/FailedShuffleCleaner.scala | 7 +- .../CelebornFetchFailureDiskCleanSuite.scala | 2 +- 6 files changed, 211 insertions(+), 5 deletions(-) create mode 100644 client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala create mode 100644 client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala create mode 100644 client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala new file mode 100644 index 00000000000..4596e91dbc2 --- /dev/null +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.celeborn.spark +import java.util +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} +import java.util.concurrent.atomic.AtomicReference + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} + +import org.apache.celeborn.client.LifecycleManager +import org.apache.celeborn.common.internal.Logging + +private[celeborn] object FailedShuffleCleaner extends Logging { + + private val lifecycleManager = new AtomicReference[LifecycleManager](null) + // in celeborn ids + private val shufflesToBeCleand = new LinkedBlockingQueue[Int]() + private val cleanedShuffleIds = new mutable.HashSet[Int] + // celeborn shuffle id to stage id referred to it + private[celeborn] val celebornShuffleIdToReferringStages = + new ConcurrentHashMap[Int, mutable.HashSet[Int]]() + + private val lock = new Object + val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" + + private[celeborn] var runningStageManager: RunningStageManager = buildRunningStageChecker() + + // for testing + private def buildRunningStageChecker(): RunningStageManager = { + if (System.getProperty(RUNNING_STAGE_CHECKER_CLASS) == null) { + new RunningStageManagerImpl + } else { + val className = System.getProperty(RUNNING_STAGE_CHECKER_CLASS) + val claz = Class.forName(className) + claz.getDeclaredConstructor().newInstance().asInstanceOf[RunningStageManager] + } + } + + // for test + def reset(): Unit = { + lifecycleManager.set(null) + shufflesToBeCleand.clear() + cleanedShuffleIds.clear() + celebornShuffleIdToReferringStages.clear() + runningStageManager = buildRunningStageChecker() + } + + def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { + // this is only implemented/tested with Spark for now + val Array(_, stageId, _) = appShuffleIdentifier.split('-') + val stageIds = + celebornShuffleIdToReferringStages.computeIfAbsent( + celebornShuffleId, + (_: Int) => new mutable.HashSet[Int]()) + lock.synchronized { + stageIds.add(stageId.toInt) + } + } + + private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = { + val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && + celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) + if (ret) { + logInfo(s"only stage $stageId refers to shuffle $celebornShuffleId, adding for clean up") + } + ret + } + + def addShuffleIdToBeCleaned( + celebornShuffleId: Int, + appShuffleIdentifier: String): Unit = { + val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') + lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { + case (pastAppShuffleIdentifier, (celebornShuffleId, _)) => { + if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) + || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) + || noRunningDownstreamStage(celebornShuffleId) + || !committedSuccessfully(celebornShuffleId)) { + val Array(_, stageId, attemptId) = pastAppShuffleIdentifier.split('-') + shufflesToBeCleand.put(celebornShuffleId) + } + } + } + } + + private def committedSuccessfully(celebornShuffleId: Int): Boolean = { + val ret = !lifecycleManager.get().commitManager.getCommitHandler(celebornShuffleId) + .isStageDataLost(celebornShuffleId) + if (!ret) { + logInfo(s"shuffle $celebornShuffleId is failed to commit, adding for cleaning up") + } + ret + } + + def setLifecycleManager(ref: LifecycleManager): Unit = { + lifecycleManager.compareAndSet(null, ref) + } + + private def noRunningDownstreamStage(shuffleId: Int): Boolean = { + val allReferringStageIds = celebornShuffleIdToReferringStages.get(shuffleId) + require(allReferringStageIds != null, s"no stage referring to shuffle $shuffleId") + val ret = + allReferringStageIds.count(stageId => runningStageManager.isRunningStage(stageId)) == 0 + if (ret) { + logInfo(s"no running downstream stages refers to $shuffleId") + } else { + logInfo(s"there is more than one running downstream stage referring to shuffle $shuffleId," + + s" ignore it for cleanup ") + } + ret + } + + private val cleanerThread = new Thread() { + override def run(): Unit = { + while (true) { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleand.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.get().unregisterShuffle(shuffleId) + logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId + } + } + Thread.sleep(1000) + } + } + } + + cleanerThread.setName("shuffle cleaner thread") + cleanerThread.setDaemon(true) + cleanerThread.start() +} diff --git a/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala b/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala new file mode 100644 index 00000000000..cc7bd44ed7f --- /dev/null +++ b/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +object SparkContextHelper { + + def env: SparkEnv = { + assert(SparkContext.getActive.isDefined) + SparkContext.getActive.get.env + } +} diff --git a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala new file mode 100644 index 00000000000..daa9688c4e9 --- /dev/null +++ b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.scheduler + +import org.apache.spark.SparkContext + +trait RunningStageManager { + def isRunningStage(stageId: Int): Boolean +} + +class RunningStageManagerImpl extends RunningStageManager { + private def dagScheduler = SparkContext.getActive.get.dagScheduler + override def isRunningStage(stageId: Int): Boolean = { + dagScheduler.runningStages.map(_.id).contains(stageId) + } +} diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 23aa2f11a1a..ad173adb922 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -77,6 +77,7 @@ import org.apache.celeborn.reflect.DynConstructors; import org.apache.celeborn.reflect.DynFields; import org.apache.celeborn.reflect.DynMethods; +import org.apache.celeborn.spark.FailedShuffleCleaner; public class SparkUtils { private static final Logger LOG = LoggerFactory.getLogger(SparkUtils.class); diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala index b1574d75c57..4596e91dbc2 100644 --- a/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala @@ -14,8 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.celeborn.spark - +package org.apache.celeborn.spark import java.util import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} import java.util.concurrent.atomic.AtomicReference @@ -28,14 +27,14 @@ import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging -object FailedShuffleCleaner extends Logging { +private[celeborn] object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) // in celeborn ids private val shufflesToBeCleand = new LinkedBlockingQueue[Int]() private val cleanedShuffleIds = new mutable.HashSet[Int] // celeborn shuffle id to stage id referred to it - val celebornShuffleIdToReferringStages = + private[celeborn] val celebornShuffleIdToReferringStages = new ConcurrentHashMap[Int, mutable.HashSet[Int]]() private val lock = new Object diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index cf2fec05cfa..702d6a60014 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -20,7 +20,6 @@ import java.io.File import scala.collection.mutable -import org.apache.spark.celeborn.spark.FailedShuffleCleaner import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} import org.apache.spark.sql.SparkSession import org.scalatest.BeforeAndAfterEach @@ -28,6 +27,7 @@ import org.scalatest.funsuite.AnyFunSuite import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.service.deploy.worker.Worker +import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureTestBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite From eac8e4be2f12d20809a1860ccebe1aa6e509992b Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 21 Feb 2025 19:08:01 -0800 Subject: [PATCH 018/120] compatible with 2.11 --- .../org/apache/celeborn/spark/FailedShuffleCleaner.scala | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 4596e91dbc2..6a8e3afa81b 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -65,12 +65,9 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { // this is only implemented/tested with Spark for now val Array(_, stageId, _) = appShuffleIdentifier.split('-') - val stageIds = - celebornShuffleIdToReferringStages.computeIfAbsent( - celebornShuffleId, - (_: Int) => new mutable.HashSet[Int]()) + celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, new mutable.HashSet[Int]()) lock.synchronized { - stageIds.add(stageId.toInt) + celebornShuffleIdToReferringStages.get(celebornShuffleId).add(stageId.toInt) } } From 35f6db0ad982736759b97023fcccdba28f3f6483 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 7 Mar 2025 08:19:16 -0800 Subject: [PATCH 019/120] fix rebase errors --- .../org/apache/spark/SparkContextHelper.scala | 26 --- .../celeborn/spark/FailedShuffleCleaner.scala | 150 ------------------ .../spark/scheduler/RunningStageManager.scala | 30 ---- 3 files changed, 206 deletions(-) delete mode 100644 client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala delete mode 100644 client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala delete mode 100644 client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala deleted file mode 100644 index cc7bd44ed7f..00000000000 --- a/client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark - -object SparkContextHelper { - - def env: SparkEnv = { - assert(SparkContext.getActive.isDefined) - SparkContext.getActive.get.env - } -} diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala deleted file mode 100644 index 4596e91dbc2..00000000000 --- a/client-spark/spark-3/src/main/scala/org/apache/spark/celeborn/spark/FailedShuffleCleaner.scala +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.celeborn.spark -import java.util -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} -import java.util.concurrent.atomic.AtomicReference - -import scala.collection.JavaConverters._ -import scala.collection.mutable - -import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} - -import org.apache.celeborn.client.LifecycleManager -import org.apache.celeborn.common.internal.Logging - -private[celeborn] object FailedShuffleCleaner extends Logging { - - private val lifecycleManager = new AtomicReference[LifecycleManager](null) - // in celeborn ids - private val shufflesToBeCleand = new LinkedBlockingQueue[Int]() - private val cleanedShuffleIds = new mutable.HashSet[Int] - // celeborn shuffle id to stage id referred to it - private[celeborn] val celebornShuffleIdToReferringStages = - new ConcurrentHashMap[Int, mutable.HashSet[Int]]() - - private val lock = new Object - val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" - - private[celeborn] var runningStageManager: RunningStageManager = buildRunningStageChecker() - - // for testing - private def buildRunningStageChecker(): RunningStageManager = { - if (System.getProperty(RUNNING_STAGE_CHECKER_CLASS) == null) { - new RunningStageManagerImpl - } else { - val className = System.getProperty(RUNNING_STAGE_CHECKER_CLASS) - val claz = Class.forName(className) - claz.getDeclaredConstructor().newInstance().asInstanceOf[RunningStageManager] - } - } - - // for test - def reset(): Unit = { - lifecycleManager.set(null) - shufflesToBeCleand.clear() - cleanedShuffleIds.clear() - celebornShuffleIdToReferringStages.clear() - runningStageManager = buildRunningStageChecker() - } - - def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { - // this is only implemented/tested with Spark for now - val Array(_, stageId, _) = appShuffleIdentifier.split('-') - val stageIds = - celebornShuffleIdToReferringStages.computeIfAbsent( - celebornShuffleId, - (_: Int) => new mutable.HashSet[Int]()) - lock.synchronized { - stageIds.add(stageId.toInt) - } - } - - private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = { - val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && - celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) - if (ret) { - logInfo(s"only stage $stageId refers to shuffle $celebornShuffleId, adding for clean up") - } - ret - } - - def addShuffleIdToBeCleaned( - celebornShuffleId: Int, - appShuffleIdentifier: String): Unit = { - val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') - lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { - case (pastAppShuffleIdentifier, (celebornShuffleId, _)) => { - if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) - || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) - || noRunningDownstreamStage(celebornShuffleId) - || !committedSuccessfully(celebornShuffleId)) { - val Array(_, stageId, attemptId) = pastAppShuffleIdentifier.split('-') - shufflesToBeCleand.put(celebornShuffleId) - } - } - } - } - - private def committedSuccessfully(celebornShuffleId: Int): Boolean = { - val ret = !lifecycleManager.get().commitManager.getCommitHandler(celebornShuffleId) - .isStageDataLost(celebornShuffleId) - if (!ret) { - logInfo(s"shuffle $celebornShuffleId is failed to commit, adding for cleaning up") - } - ret - } - - def setLifecycleManager(ref: LifecycleManager): Unit = { - lifecycleManager.compareAndSet(null, ref) - } - - private def noRunningDownstreamStage(shuffleId: Int): Boolean = { - val allReferringStageIds = celebornShuffleIdToReferringStages.get(shuffleId) - require(allReferringStageIds != null, s"no stage referring to shuffle $shuffleId") - val ret = - allReferringStageIds.count(stageId => runningStageManager.isRunningStage(stageId)) == 0 - if (ret) { - logInfo(s"no running downstream stages refers to $shuffleId") - } else { - logInfo(s"there is more than one running downstream stage referring to shuffle $shuffleId," + - s" ignore it for cleanup ") - } - ret - } - - private val cleanerThread = new Thread() { - override def run(): Unit = { - while (true) { - val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleand.drainTo(allShuffleIds) - allShuffleIds.asScala.foreach { shuffleId => - if (!cleanedShuffleIds.contains(shuffleId)) { - lifecycleManager.get().unregisterShuffle(shuffleId) - logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - cleanedShuffleIds += shuffleId - } - } - Thread.sleep(1000) - } - } - } - - cleanerThread.setName("shuffle cleaner thread") - cleanerThread.setDaemon(true) - cleanerThread.start() -} diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala deleted file mode 100644 index daa9688c4e9..00000000000 --- a/client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.scheduler - -import org.apache.spark.SparkContext - -trait RunningStageManager { - def isRunningStage(stageId: Int): Boolean -} - -class RunningStageManagerImpl extends RunningStageManager { - private def dagScheduler = SparkContext.getActive.get.dagScheduler - override def isRunningStage(stageId: Int): Boolean = { - dagScheduler.runningStages.map(_.id).contains(stageId) - } -} From 59ea64f1f2cd0f8be8b40d133719cf3995e325f4 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 7 Mar 2025 16:01:37 -0800 Subject: [PATCH 020/120] more time to finish test --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 702d6a60014..8d8ca5416af 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -88,7 +88,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession: SparkSession) extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { override def run(): Unit = { - val timeout = 60000 + val timeout = 240000 var elapseTime = 0L var allDataInShape = checkDirStatus() while (!allDataInShape) { @@ -128,7 +128,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite checkingThread } private def checkStorageValidation(checkingThread: Thread): Unit = { - checkingThread.join(120 * 1000) + checkingThread.join(240 * 1000) if (checkingThread.isAlive || checkingThread.asInstanceOf[CheckingThread].exception != null) { throw new IllegalStateException("the storage checking status failed," + s"${}") From 1e471f855a303f9956bdf7d6af33b9de1797a978 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 7 Mar 2025 17:19:10 -0800 Subject: [PATCH 021/120] Revert "more time to finish test" This reverts commit d397ecebb3017cca15f82db87b1c39674361d418. --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 8d8ca5416af..702d6a60014 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -88,7 +88,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession: SparkSession) extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { override def run(): Unit = { - val timeout = 240000 + val timeout = 60000 var elapseTime = 0L var allDataInShape = checkDirStatus() while (!allDataInShape) { @@ -128,7 +128,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite checkingThread } private def checkStorageValidation(checkingThread: Thread): Unit = { - checkingThread.join(240 * 1000) + checkingThread.join(120 * 1000) if (checkingThread.isAlive || checkingThread.asInstanceOf[CheckingThread].exception != null) { throw new IllegalStateException("the storage checking status failed," + s"${}") From fbf36b455275228d474f01388d4fc02490f7eb97 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 7 Mar 2025 20:35:43 -0800 Subject: [PATCH 022/120] add more msg got storage --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 702d6a60014..f3c9eee4b98 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -60,6 +60,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession: SparkSession) extends Thread { var exception: Exception = _ + protected def checkDirStatus(): Boolean = { val deletedSuccessfully = shuffleIdShouldNotExist.forall(shuffleId => { workerDirs.forall(dir => @@ -73,6 +74,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite }) deletedSuccessfully && createdSuccessfully } + override def run(): Unit = { var allDataInShape = checkDirStatus() while (!allDataInShape) { @@ -127,13 +129,15 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite checkingThread.start() checkingThread } + private def checkStorageValidation(checkingThread: Thread): Unit = { checkingThread.join(120 * 1000) if (checkingThread.isAlive || checkingThread.asInstanceOf[CheckingThread].exception != null) { throw new IllegalStateException("the storage checking status failed," + - s"${}") + s"${checkingThread.isAlive} ${checkingThread.asInstanceOf[CheckingThread].exception.getMessage}") } } + // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { From 2ed92b21b4a24bbc1b651554aff1a4d88b44d551 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 12:58:19 -0800 Subject: [PATCH 023/120] remove first few tests and test what happened --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index f3c9eee4b98..d3f1046715f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -138,6 +138,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } } + /* // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { @@ -251,7 +252,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite assert(tuples.head.toString().equals(expect)) sparkSession.stop() } - } + }*/ // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + @@ -286,7 +287,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession.stop() } } - + // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { if (Spark3OrNewer) { @@ -318,6 +319,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession.stop() } } + // 8. if the dependency is 1 to M but failed in commit phase, we should just clean it test("celeborn spark integration test - clear the failed-to-commit shuffle file even it is referred by more than once") { if (Spark3OrNewer) { From 4d9139d5d165a105f738cff2b353b65be6f6d29c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:02:33 -0800 Subject: [PATCH 024/120] test --- .../spark/CelebornFetchFailureDiskCleanSuite.scala | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index d3f1046715f..d51bb088c81 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -67,11 +67,23 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite !new File(s"$dir/celeborn-worker/shuffle_data/" + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) }) + val deletedSuccessfullyString = shuffleIdShouldNotExist.map(shuffleId => { + shuffleId.toString + ":" + + workerDirs.map(dir => + !new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList + }).mkString(",") val createdSuccessfully = shuffleIdMustExist.forall(shuffleId => { workerDirs.exists(dir => new File(s"$dir/celeborn-worker/shuffle_data/" + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) }) + val createdSuccessfullyString = shuffleIdMustExist.map(shuffleId => { + workerDirs.map(dir => + new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList + }).mkString(",") + println(s"${deletedSuccessfullyString} \t $createdSuccessfullyString") deletedSuccessfully && createdSuccessfully } @@ -287,7 +299,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession.stop() } } - + // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { if (Spark3OrNewer) { From 5b557b08d89b75eb49d3690f78d2e6f69dcb753f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:19:29 -0800 Subject: [PATCH 025/120] more test --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index d51bb088c81..ad1b99573ca 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -102,7 +102,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession: SparkSession) extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { override def run(): Unit = { - val timeout = 60000 + val timeout = 30000 var elapseTime = 0L var allDataInShape = checkDirStatus() while (!allDataInShape) { @@ -143,7 +143,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } private def checkStorageValidation(checkingThread: Thread): Unit = { - checkingThread.join(120 * 1000) + checkingThread.join(240 * 1000) if (checkingThread.isAlive || checkingThread.asInstanceOf[CheckingThread].exception != null) { throw new IllegalStateException("the storage checking status failed," + s"${checkingThread.isAlive} ${checkingThread.asInstanceOf[CheckingThread].exception.getMessage}") @@ -235,7 +235,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite assert(tuples.head.toString().equals(expect)) sparkSession.stop() } - } + }*/ // 4. for single level M-1 lineage, all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with single-level lineage) all failed disk spaces are cleaned") { @@ -264,7 +264,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite assert(tuples.head.toString().equals(expect)) sparkSession.stop() } - }*/ + } // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + From b6f72859edf799e80005f3fdb072d98c90fd59d7 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:24:15 -0800 Subject: [PATCH 026/120] add back one more test --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index ad1b99573ca..bc4f18b63c8 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -209,7 +209,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } sparkSession.stop() } - } + }*/ // 3. for single level M-1 lineage, the single failed disk space is cleaned test( @@ -235,7 +235,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite assert(tuples.head.toString().equals(expect)) sparkSession.stop() } - }*/ + } // 4. for single level M-1 lineage, all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with single-level lineage) all failed disk spaces are cleaned") { From 23b230b0c77528f8bda2b8e4d096cde9d0810b5f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:29:11 -0800 Subject: [PATCH 027/120] one more test --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index bc4f18b63c8..92adbd16989 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -176,7 +176,8 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } sparkSession.stop() } - } + }*/ + // 2. for multiple level 1-1 lineage, the old disk space is cleaned one by one test("celeborn spark integration test - (1-1 dep with, multi-level lineage) the failed shuffle file is cleaned up correctly") { if (Spark3OrNewer) { @@ -209,7 +210,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } sparkSession.stop() } - }*/ + } // 3. for single level M-1 lineage, the single failed disk space is cleaned test( From 72a170c2e49124ba80d0b2e98ac535693bcf7f43 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:35:09 -0800 Subject: [PATCH 028/120] more debugging info --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 92adbd16989..e7fb36c48a2 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -79,9 +79,10 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) }) val createdSuccessfullyString = shuffleIdMustExist.map(shuffleId => { - workerDirs.map(dir => - new File(s"$dir/celeborn-worker/shuffle_data/" + - s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList + shuffleId.toString + ":" + + workerDirs.map(dir => + new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList }).mkString(",") println(s"${deletedSuccessfullyString} \t $createdSuccessfullyString") deletedSuccessfully && createdSuccessfully @@ -102,7 +103,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite sparkSession: SparkSession) extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { override def run(): Unit = { - val timeout = 30000 + val timeout = 20000 var elapseTime = 0L var allDataInShape = checkDirStatus() while (!allDataInShape) { From fa80ed34d75afe52d014a214674dcaa5667dd7f5 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:35:30 -0800 Subject: [PATCH 029/120] add back one more test --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index e7fb36c48a2..6e719b91a47 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -151,7 +151,6 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } } - /* // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { @@ -177,7 +176,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } sparkSession.stop() } - }*/ + } // 2. for multiple level 1-1 lineage, the old disk space is cleaned one by one test("celeborn spark integration test - (1-1 dep with, multi-level lineage) the failed shuffle file is cleaned up correctly") { From e40c4c11df016440835ff190ddda5bf1ebc5a210 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:46:04 -0800 Subject: [PATCH 030/120] handle empty message --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 6e719b91a47..a67ef356b03 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -143,11 +143,13 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite checkingThread } - private def checkStorageValidation(checkingThread: Thread): Unit = { + private def checkStorageValidation(thread: Thread): Unit = { + val checkingThread = thread.asInstanceOf[CheckingThread] checkingThread.join(240 * 1000) - if (checkingThread.isAlive || checkingThread.asInstanceOf[CheckingThread].exception != null) { + if (checkingThread.isAlive || checkingThread.exception != null) { throw new IllegalStateException("the storage checking status failed," + - s"${checkingThread.isAlive} ${checkingThread.asInstanceOf[CheckingThread].exception.getMessage}") + s"${checkingThread.isAlive} ${if (checkingThread.exception != null) checkingThread.exception.getMessage + else "NULL"}") } } From 413dbc66ebf30e16d634490c2d932db6d861565a Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 13:58:13 -0800 Subject: [PATCH 031/120] rm useless println --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index a67ef356b03..fefbb13b741 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -263,7 +263,6 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite // verify result assert(hook.executed.get()) val expect = "[2,1,1]" - println(tuples.head.toString()) assert(tuples.head.toString().equals(expect)) sparkSession.stop() } From 73392670b4d4597f6c5c50f188b0a258d1fc4fd6 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 17:34:02 -0800 Subject: [PATCH 032/120] allow more time in the suspicious test --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index fefbb13b741..1145e46b5c0 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -143,9 +143,9 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite checkingThread } - private def checkStorageValidation(thread: Thread): Unit = { + private def checkStorageValidation(thread: Thread, timeout: Long = 60 * 1000): Unit = { val checkingThread = thread.asInstanceOf[CheckingThread] - checkingThread.join(240 * 1000) + checkingThread.join(timeout) if (checkingThread.isAlive || checkingThread.exception != null) { throw new IllegalStateException("the storage checking status failed," + s"${checkingThread.isAlive} ${if (checkingThread.exception != null) checkingThread.exception.getMessage @@ -293,7 +293,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite .withColumnRenamed("count", "countId").groupBy("countId").count() .withColumnRenamed("count", "df2_count") val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() - checkStorageValidation(checkingThread) + checkStorageValidation(checkingThread, timeout = 600 * 1000) // verify result assert(hook.executed.get()) val expect = "[1,2,2]" From 37022599015878c5b599eb4583c98bb12890751c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 17:37:01 -0800 Subject: [PATCH 033/120] more --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 1145e46b5c0..58b462a53d5 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -143,7 +143,7 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite checkingThread } - private def checkStorageValidation(thread: Thread, timeout: Long = 60 * 1000): Unit = { + private def checkStorageValidation(thread: Thread, timeout: Long = 120 * 1000): Unit = { val checkingThread = thread.asInstanceOf[CheckingThread] checkingThread.join(timeout) if (checkingThread.isAlive || checkingThread.exception != null) { From 257e64945dad73502d3dbe8dae5a356ce8321c02 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 18:32:47 -0800 Subject: [PATCH 034/120] try to separate test and see whether it works --- ...nFetchFailureDiskCleanExpensiveSuite.scala | 59 +++++++ .../CelebornFetchFailureDiskCleanSuite.scala | 166 +----------------- .../FetchFailureDiskCleanBase.scala | 152 ++++++++++++++++ 3 files changed, 213 insertions(+), 164 deletions(-) create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala new file mode 100644 index 00000000000..c1d8690bf74 --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.celeborn.tests.spark + +import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} + +import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook} + +class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanBase { + + // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned + test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + + " correctly") { + if (Spark3OrNewer) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1, 2, 3), + triggerStageId = Some(4)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = triggerStorageCheckThread( + Seq(0, 1, 2, 3), + Seq(4, 5, 6, 7), + sparkSession, + forStableStatusChecking = false) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df1_count") + val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df2_count") + val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() + checkStorageValidation(checkingThread, timeout = 600 * 1000) + // verify result + assert(hook.executed.get()) + val expect = "[1,2,2]" + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + } + } + +} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 58b462a53d5..823f9784ac5 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -16,142 +16,14 @@ */ package org.apache.celeborn.tests.spark -import java.io.File - import scala.collection.mutable import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} -import org.apache.spark.sql.SparkSession -import org.scalatest.BeforeAndAfterEach -import org.scalatest.funsuite.AnyFunSuite -import org.apache.celeborn.client.ShuffleClient -import org.apache.celeborn.service.deploy.worker.Worker import org.apache.celeborn.spark.FailedShuffleCleaner -import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureTestBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} - -class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite - with FetchFailureTestBase - with BeforeAndAfterEach { - - override def beforeAll(): Unit = { - logInfo("test initialized , setup Celeborn mini cluster") - setupMiniClusterWithRandomPorts(workerNum = 1) - } - - override def beforeEach(): Unit = { - ShuffleClient.reset() - FailedShuffleCleaner.reset() - } - - override def afterEach(): Unit = { - System.gc() - } - - override def createWorker(map: Map[String, String]): Worker = { - val storageDir = createTmpDir() - workerDirs = workerDirs :+ storageDir - super.createWorker(map ++ Map("celeborn.master.heartbeat.worker.timeout" -> "10s"), storageDir) - } - - class CheckingThread( - shuffleIdShouldNotExist: Seq[Int], - shuffleIdMustExist: Seq[Int], - sparkSession: SparkSession) - extends Thread { - var exception: Exception = _ - - protected def checkDirStatus(): Boolean = { - val deletedSuccessfully = shuffleIdShouldNotExist.forall(shuffleId => { - workerDirs.forall(dir => - !new File(s"$dir/celeborn-worker/shuffle_data/" + - s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) - }) - val deletedSuccessfullyString = shuffleIdShouldNotExist.map(shuffleId => { - shuffleId.toString + ":" + - workerDirs.map(dir => - !new File(s"$dir/celeborn-worker/shuffle_data/" + - s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList - }).mkString(",") - val createdSuccessfully = shuffleIdMustExist.forall(shuffleId => { - workerDirs.exists(dir => - new File(s"$dir/celeborn-worker/shuffle_data/" + - s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) - }) - val createdSuccessfullyString = shuffleIdMustExist.map(shuffleId => { - shuffleId.toString + ":" + - workerDirs.map(dir => - new File(s"$dir/celeborn-worker/shuffle_data/" + - s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList - }).mkString(",") - println(s"${deletedSuccessfullyString} \t $createdSuccessfullyString") - deletedSuccessfully && createdSuccessfully - } - - override def run(): Unit = { - var allDataInShape = checkDirStatus() - while (!allDataInShape) { - Thread.sleep(1000) - allDataInShape = checkDirStatus() - } - } - } - - class CheckingThreadForStableStatus( - shuffleIdShouldNotExist: Seq[Int], - shuffleIdMustExist: Seq[Int], - sparkSession: SparkSession) - extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { - override def run(): Unit = { - val timeout = 20000 - var elapseTime = 0L - var allDataInShape = checkDirStatus() - while (!allDataInShape) { - Thread.sleep(5000) - println("init state not meet") - allDataInShape = checkDirStatus() - } - while (allDataInShape) { - Thread.sleep(5000) - elapseTime += 5000 - if (elapseTime > timeout) { - return - } - allDataInShape = checkDirStatus() - if (!allDataInShape) { - exception = new IllegalStateException("the directory state does not meet" + - " the expected state") - throw exception - } - } - } - } +import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} - private def triggerStorageCheckThread( - shuffleIdShouldNotExist: Seq[Int], - shuffleIdMustExist: Seq[Int], - sparkSession: SparkSession, - forStableStatusChecking: Boolean): CheckingThread = { - val checkingThread = - if (!forStableStatusChecking) { - new CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) - } else { - new CheckingThreadForStableStatus(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) - } - checkingThread.setDaemon(true) - checkingThread.start() - checkingThread - } - - private def checkStorageValidation(thread: Thread, timeout: Long = 120 * 1000): Unit = { - val checkingThread = thread.asInstanceOf[CheckingThread] - checkingThread.join(timeout) - if (checkingThread.isAlive || checkingThread.exception != null) { - throw new IllegalStateException("the storage checking status failed," + - s"${checkingThread.isAlive} ${if (checkingThread.exception != null) checkingThread.exception.getMessage - else "NULL"}") - } - } +class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish @@ -268,40 +140,6 @@ class CelebornFetchFailureDiskCleanSuite extends AnyFunSuite } } - // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned - test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + - " correctly") { - if (Spark3OrNewer) { - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0, 1, 2, 3), - triggerStageId = Some(4)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = triggerStorageCheckThread( - Seq(0, 1, 2, 3), - Seq(4, 5, 6, 7), - sparkSession, - forStableStatusChecking = false) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df1_count") - val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df2_count") - val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() - checkStorageValidation(checkingThread, timeout = 600 * 1000) - // verify result - assert(hook.executed.get()) - val expect = "[1,2,2]" - assert(tuples.head.toString().equals(expect)) - sparkSession.stop() - } - } - // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { if (Spark3OrNewer) { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala new file mode 100644 index 00000000000..f2e547e0eeb --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -0,0 +1,152 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.celeborn.tests.spark.fetch_failure + +import java.io.File + +import org.apache.spark.sql.SparkSession +import org.scalatest.BeforeAndAfterEach +import org.scalatest.funsuite.AnyFunSuite + +import org.apache.celeborn.client.ShuffleClient +import org.apache.celeborn.service.deploy.worker.Worker +import org.apache.celeborn.spark.FailedShuffleCleaner + +private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite + with FetchFailureTestBase + with BeforeAndAfterEach { + override def beforeAll(): Unit = { + logInfo("test initialized , setup Celeborn mini cluster") + setupMiniClusterWithRandomPorts(workerNum = 1) + } + + override def beforeEach(): Unit = { + ShuffleClient.reset() + FailedShuffleCleaner.reset() + } + + override def afterEach(): Unit = { + System.gc() + } + + override def createWorker(map: Map[String, String]): Worker = { + val storageDir = createTmpDir() + workerDirs = workerDirs :+ storageDir + super.createWorker(map ++ Map("celeborn.master.heartbeat.worker.timeout" -> "10s"), storageDir) + } + + class CheckingThread( + shuffleIdShouldNotExist: Seq[Int], + shuffleIdMustExist: Seq[Int], + sparkSession: SparkSession) + extends Thread { + var exception: Exception = _ + + protected def checkDirStatus(): Boolean = { + val deletedSuccessfully = shuffleIdShouldNotExist.forall(shuffleId => { + workerDirs.forall(dir => + !new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) + }) + val deletedSuccessfullyString = shuffleIdShouldNotExist.map(shuffleId => { + shuffleId.toString + ":" + + workerDirs.map(dir => + !new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList + }).mkString(",") + val createdSuccessfully = shuffleIdMustExist.forall(shuffleId => { + workerDirs.exists(dir => + new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()) + }) + val createdSuccessfullyString = shuffleIdMustExist.map(shuffleId => { + shuffleId.toString + ":" + + workerDirs.map(dir => + new File(s"$dir/celeborn-worker/shuffle_data/" + + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList + }).mkString(",") + println(s"${deletedSuccessfullyString} \t $createdSuccessfullyString") + deletedSuccessfully && createdSuccessfully + } + + override def run(): Unit = { + var allDataInShape = checkDirStatus() + while (!allDataInShape) { + Thread.sleep(1000) + allDataInShape = checkDirStatus() + } + } + } + + class CheckingThreadForStableStatus( + shuffleIdShouldNotExist: Seq[Int], + shuffleIdMustExist: Seq[Int], + sparkSession: SparkSession) + extends CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) { + override def run(): Unit = { + val timeout = 20000 + var elapseTime = 0L + var allDataInShape = checkDirStatus() + while (!allDataInShape) { + Thread.sleep(5000) + println("init state not meet") + allDataInShape = checkDirStatus() + } + while (allDataInShape) { + Thread.sleep(5000) + elapseTime += 5000 + if (elapseTime > timeout) { + return + } + allDataInShape = checkDirStatus() + if (!allDataInShape) { + exception = new IllegalStateException("the directory state does not meet" + + " the expected state") + throw exception + } + } + } + } + + protected def triggerStorageCheckThread( + shuffleIdShouldNotExist: Seq[Int], + shuffleIdMustExist: Seq[Int], + sparkSession: SparkSession, + forStableStatusChecking: Boolean): CheckingThread = { + val checkingThread = + if (!forStableStatusChecking) { + new CheckingThread(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) + } else { + new CheckingThreadForStableStatus(shuffleIdShouldNotExist, shuffleIdMustExist, sparkSession) + } + checkingThread.setDaemon(true) + checkingThread.start() + checkingThread + } + + protected def checkStorageValidation(thread: Thread, timeout: Long = 120 * 1000): Unit = { + val checkingThread = thread.asInstanceOf[CheckingThread] + checkingThread.join(timeout) + if (checkingThread.isAlive || checkingThread.exception != null) { + throw new IllegalStateException("the storage checking status failed," + + s"${checkingThread.isAlive} ${if (checkingThread.exception != null) checkingThread.exception.getMessage + else "NULL"}") + } + } + +} From 574bc519d4b62bb0af6e4f265d5773f23d5d314c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 20:36:36 -0800 Subject: [PATCH 035/120] check more frequently --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index f2e547e0eeb..bd2cc461bf1 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -30,6 +30,7 @@ import org.apache.celeborn.spark.FailedShuffleCleaner private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite with FetchFailureTestBase with BeforeAndAfterEach { + override def beforeAll(): Unit = { logInfo("test initialized , setup Celeborn mini cluster") setupMiniClusterWithRandomPorts(workerNum = 1) @@ -87,7 +88,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite override def run(): Unit = { var allDataInShape = checkDirStatus() while (!allDataInShape) { - Thread.sleep(1000) + Thread.sleep(200) allDataInShape = checkDirStatus() } } From 7b9bcb4b896dc8f657c9ff7695503ee6c47c7d51 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 8 Mar 2025 22:28:08 -0800 Subject: [PATCH 036/120] override shutdown minicluster in expensive suite --- .../CelebornFetchFailureDiskCleanExpensiveSuite.scala | 7 +++++++ .../spark/fetch_failure/FetchFailureDiskCleanBase.scala | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala index c1d8690bf74..78ab97479c6 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala @@ -22,6 +22,13 @@ import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanBase { + override def afterAll(): Unit = { + logInfo("all test complete , stop Celeborn mini cluster") + // to avoid generated files being deleted too quickly + Thread.sleep(60 * 1000) + shutdownMiniCluster() + } + // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + " correctly") { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index bd2cc461bf1..846216087bd 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -88,7 +88,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite override def run(): Unit = { var allDataInShape = checkDirStatus() while (!allDataInShape) { - Thread.sleep(200) + Thread.sleep(1000) allDataInShape = checkDirStatus() } } From 0a1dc8074432f3517536330ac4795c5924216486 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 9 Mar 2025 08:14:04 -0700 Subject: [PATCH 037/120] try persist --- .../apache/celeborn/spark/FailedShuffleCleaner.scala | 1 + .../CelebornFetchFailureDiskCleanExpensiveSuite.scala | 11 +++-------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 6a8e3afa81b..e04ea9b2ee1 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -133,6 +133,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { if (!cleanedShuffleIds.contains(shuffleId)) { lifecycleManager.get().unregisterShuffle(shuffleId) logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + println(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") cleanedShuffleIds += shuffleId } } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala index 78ab97479c6..6c20366409f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala @@ -22,13 +22,6 @@ import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanBase { - override def afterAll(): Unit = { - logInfo("all test complete , stop Celeborn mini cluster") - // to avoid generated files being deleted too quickly - Thread.sleep(60 * 1000) - shutdownMiniCluster() - } - // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + " correctly") { @@ -53,7 +46,9 @@ class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanB val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() .withColumnRenamed("count", "countId").groupBy("countId").count() .withColumnRenamed("count", "df2_count") - val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() + val df = df1.hint("merge").join(df2, "countId").select("*").persist() + val tuples = df.collect() + df.collect() checkStorageValidation(checkingThread, timeout = 600 * 1000) // verify result assert(hook.executed.get()) From 9fb22d9dc8b3786419fa3191953ba05bc2d4688a Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 9 Mar 2025 11:04:17 -0700 Subject: [PATCH 038/120] move back test and see --- ...nFetchFailureDiskCleanExpensiveSuite.scala | 61 ------------------- .../CelebornFetchFailureDiskCleanSuite.scala | 36 +++++++++++ 2 files changed, 36 insertions(+), 61 deletions(-) delete mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala deleted file mode 100644 index 6c20366409f..00000000000 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.celeborn.tests.spark - -import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} - -import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook} - -class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanBase { - - // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned - test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + - " correctly") { - if (Spark3OrNewer) { - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0, 1, 2, 3), - triggerStageId = Some(4)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = triggerStorageCheckThread( - Seq(0, 1, 2, 3), - Seq(4, 5, 6, 7), - sparkSession, - forStableStatusChecking = false) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df1_count") - val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df2_count") - val df = df1.hint("merge").join(df2, "countId").select("*").persist() - val tuples = df.collect() - df.collect() - checkStorageValidation(checkingThread, timeout = 600 * 1000) - // verify result - assert(hook.executed.get()) - val expect = "[1,2,2]" - assert(tuples.head.toString().equals(expect)) - sparkSession.stop() - } - } - -} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 823f9784ac5..8fe0331a843 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -140,6 +140,42 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { } } + // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned + test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + + " correctly") { + if (Spark3OrNewer) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1, 2, 3), + triggerStageId = Some(4)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = triggerStorageCheckThread( + Seq(0, 1, 2, 3), + Seq(4, 5, 6, 7), + sparkSession, + forStableStatusChecking = false) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df1_count") + val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df2_count") + val df = df1.hint("merge").join(df2, "countId").select("*").persist() + val tuples = df.collect() + df.collect() + checkStorageValidation(checkingThread, timeout = 600 * 1000) + // verify result + assert(hook.executed.get()) + val expect = "[1,2,2]" + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + } + } + // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { if (Spark3OrNewer) { From 14c53a5e7d1ca2a5d6a3fb01df0001274c934dbd Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 9 Mar 2025 12:59:02 -0700 Subject: [PATCH 039/120] Revert "move back test and see" This reverts commit dfa83dd913f32b0b8d70fe0fc3d06679d255abe7. --- ...nFetchFailureDiskCleanExpensiveSuite.scala | 61 +++++++++++++++++++ .../CelebornFetchFailureDiskCleanSuite.scala | 36 ----------- 2 files changed, 61 insertions(+), 36 deletions(-) create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala new file mode 100644 index 00000000000..6c20366409f --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.celeborn.tests.spark + +import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} + +import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook} + +class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanBase { + + // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned + test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + + " correctly") { + if (Spark3OrNewer) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1, 2, 3), + triggerStageId = Some(4)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = triggerStorageCheckThread( + Seq(0, 1, 2, 3), + Seq(4, 5, 6, 7), + sparkSession, + forStableStatusChecking = false) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df1_count") + val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df2_count") + val df = df1.hint("merge").join(df2, "countId").select("*").persist() + val tuples = df.collect() + df.collect() + checkStorageValidation(checkingThread, timeout = 600 * 1000) + // verify result + assert(hook.executed.get()) + val expect = "[1,2,2]" + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + } + } + +} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 8fe0331a843..823f9784ac5 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -140,42 +140,6 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { } } - // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned - test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + - " correctly") { - if (Spark3OrNewer) { - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0, 1, 2, 3), - triggerStageId = Some(4)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = triggerStorageCheckThread( - Seq(0, 1, 2, 3), - Seq(4, 5, 6, 7), - sparkSession, - forStableStatusChecking = false) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df1_count") - val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df2_count") - val df = df1.hint("merge").join(df2, "countId").select("*").persist() - val tuples = df.collect() - df.collect() - checkStorageValidation(checkingThread, timeout = 600 * 1000) - // verify result - assert(hook.executed.get()) - val expect = "[1,2,2]" - assert(tuples.head.toString().equals(expect)) - sparkSession.stop() - } - } - // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { if (Spark3OrNewer) { From 633fc2a9c1cb01371813c047fe1fc3e75e022781 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 09:32:18 -0700 Subject: [PATCH 040/120] addr comments1 --- .../celeborn/spark/FailedShuffleCleaner.scala | 8 +++++--- .../shuffle/celeborn/SparkShuffleManager.java | 14 ++++++++++++++ .../apache/spark/shuffle/celeborn/SparkUtils.java | 6 ++++++ .../apache/celeborn/client/LifecycleManager.scala | 6 ++++++ 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index e04ea9b2ee1..dddf7d77830 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -85,12 +85,11 @@ private[celeborn] object FailedShuffleCleaner extends Logging { appShuffleIdentifier: String): Unit = { val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { - case (pastAppShuffleIdentifier, (celebornShuffleId, _)) => { + case (_, (celebornShuffleId, _)) => { if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) || noRunningDownstreamStage(celebornShuffleId) || !committedSuccessfully(celebornShuffleId)) { - val Array(_, stageId, attemptId) = pastAppShuffleIdentifier.split('-') shufflesToBeCleand.put(celebornShuffleId) } } @@ -110,6 +109,10 @@ private[celeborn] object FailedShuffleCleaner extends Logging { lifecycleManager.compareAndSet(null, ref) } + def removeCleanedShuffleId(celebornShuffleId: Int): Unit = { + cleanedShuffleIds.remove(celebornShuffleId) + } + private def noRunningDownstreamStage(shuffleId: Int): Boolean = { val allReferringStageIds = celebornShuffleIdToReferringStages.get(shuffleId) require(allReferringStageIds != null, s"no stage referring to shuffle $shuffleId") @@ -133,7 +136,6 @@ private[celeborn] object FailedShuffleCleaner extends Logging { if (!cleanedShuffleIds.contains(shuffleId)) { lifecycleManager.get().unregisterShuffle(shuffleId) logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - println(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") cleanedShuffleIds += shuffleId } } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 77acac9862c..9aae5e0edad 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -156,6 +156,20 @@ private void initializeLifecycleManager(String appId) { } } + if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { + lifecycleManager.registerGetShuffleIdForWriterCallback( + (celebornShuffleId, appShuffleIdentifier) -> + SparkUtils.addWriterShuffleIdsToBeCleaned( + lifecycleManager, celebornShuffleId, appShuffleIdentifier)); + lifecycleManager.registerGetShuffleIdForReaderCallback( + (celebornShuffleId, appShuffleIdentifier) -> + SparkUtils.addShuffleIdRefCount( + lifecycleManager, celebornShuffleId, appShuffleIdentifier)); + lifecycleManager.registerUnregisterShuffleCallback( + (celebornShuffleId) -> + SparkUtils.removeCleanedShuffleId(lifecycleManager, celebornShuffleId)); + } + if (celebornConf.getReducerFileGroupBroadcastEnabled()) { lifecycleManager.registerBroadcastGetReducerFileGroupResponseCallback( (shuffleId, getReducerFileGroupResponse) -> diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index ad173adb922..048e4932ad2 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -640,4 +640,10 @@ public static void addShuffleIdRefCount( FailedShuffleCleaner.setLifecycleManager(lifecycleManager); FailedShuffleCleaner.addShuffleIdReferringStage(celebornShuffeId, appShuffleIdentifier); } + + public static void removeCleanedShuffleId( + LifecycleManager lifecycleManager, int celebornShuffleId) { + FailedShuffleCleaner.setLifecycleManager(lifecycleManager); + FailedShuffleCleaner.removeCleanedShuffleId(celebornShuffleId); + } } diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 110ff1c1612..932b45e9c44 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -1165,6 +1165,7 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends shuffleIds.values.map { case (shuffleId, _) => unregisterShuffle(shuffleId) + unregisterShuffleCallback.foreach(c => c.accept(shuffleId)) }) } } else { @@ -1866,6 +1867,11 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends getShuffleIdForReaderCallback = Some(callback) } + @volatile private var unregisterShuffleCallback: Option[Consumer[Integer]] = None + def registerUnregisterShuffleCallback(callback: Consumer[Integer]): Unit = { + unregisterShuffleCallback = Some(callback) + } + def registerAppShuffleDeterminate(appShuffleId: Int, determinate: Boolean): Unit = { appShuffleDeterminateMap.put(appShuffleId, determinate) } From c5395337bf35effaa306b062c692ec869f0c0950 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 10:22:20 -0700 Subject: [PATCH 041/120] addr comments 2 --- .../celeborn/spark/FailedShuffleCleaner.scala | 49 +++++++++---------- .../shuffle/celeborn/SparkShuffleManager.java | 1 - .../spark/shuffle/celeborn/SparkUtils.java | 1 - .../apache/celeborn/common/CelebornConf.scala | 11 +++++ 4 files changed, 35 insertions(+), 27 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index dddf7d77830..a14c848c567 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -16,7 +16,7 @@ */ package org.apache.celeborn.spark import java.util -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue} +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ @@ -26,6 +26,7 @@ import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging +import org.apache.celeborn.common.util.ThreadUtils private[celeborn] object FailedShuffleCleaner extends Logging { @@ -113,38 +114,36 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanedShuffleIds.remove(celebornShuffleId) } - private def noRunningDownstreamStage(shuffleId: Int): Boolean = { - val allReferringStageIds = celebornShuffleIdToReferringStages.get(shuffleId) - require(allReferringStageIds != null, s"no stage referring to shuffle $shuffleId") + private def noRunningDownstreamStage(celebornShuffleId: Int): Boolean = { + val allReferringStageIds = celebornShuffleIdToReferringStages.get(celebornShuffleId) + require(allReferringStageIds != null, s"no stage referring to shuffle $celebornShuffleId") val ret = allReferringStageIds.count(stageId => runningStageManager.isRunningStage(stageId)) == 0 if (ret) { - logInfo(s"no running downstream stages refers to $shuffleId") + logInfo(s"no running downstream stages refers to $celebornShuffleId") } else { - logInfo(s"there is more than one running downstream stage referring to shuffle $shuffleId," + - s" ignore it for cleanup ") + logInfo( + s"there is more than one running downstream stage referring to shuffle $celebornShuffleId," + + s" ignore it for cleanup ") } ret } - private val cleanerThread = new Thread() { - override def run(): Unit = { - while (true) { - val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleand.drainTo(allShuffleIds) - allShuffleIds.asScala.foreach { shuffleId => - if (!cleanedShuffleIds.contains(shuffleId)) { - lifecycleManager.get().unregisterShuffle(shuffleId) - logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - cleanedShuffleIds += shuffleId - } + private val cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( + "failedShuffleCleanerThreadPool") + cleanerThreadPool.scheduleWithFixedDelay( + () => { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleand.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.get().unregisterShuffle(shuffleId) + logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId } - Thread.sleep(1000) } - } - } - - cleanerThread.setName("shuffle cleaner thread") - cleanerThread.setDaemon(true) - cleanerThread.start() + }, + lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS, + lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS, + TimeUnit.SECONDS) } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 9aae5e0edad..a745c9a72cc 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -177,7 +177,6 @@ private void initializeLifecycleManager(String appId) { shuffleId, getReducerFileGroupResponse)); lifecycleManager.registerInvalidatedBroadcastCallback( shuffleId -> SparkUtils.invalidateSerializedGetReducerFileGroupResponse(shuffleId)); - } if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { lifecycleManager.registerGetShuffleIdForWriterCallback( diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 048e4932ad2..40862657641 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -45,7 +45,6 @@ import org.apache.spark.broadcast.Broadcast; import org.apache.spark.io.CompressionCodec; import org.apache.spark.io.CompressionCodec$; -import org.apache.spark.celeborn.spark.FailedShuffleCleaner; import org.apache.spark.scheduler.DAGScheduler; import org.apache.spark.scheduler.MapStatus; import org.apache.spark.scheduler.MapStatus$; diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 5a0e3cc764f..ec848dda517 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -998,6 +998,8 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se def clientFetchMaxRetriesForEachReplica: Int = get(CLIENT_FETCH_MAX_RETRIES_FOR_EACH_REPLICA) def clientStageRerunEnabled: Boolean = get(CLIENT_STAGE_RERUN_ENABLED) def clientFetchCleanFailedShuffle: Boolean = get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE) + def clientFetchCleanFailedShuffleIntervalMS: Long = + get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL) def clientFetchExcludeWorkerOnFailureEnabled: Boolean = get(CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED) def clientFetchExcludedWorkerExpireTimeout: Long = @@ -4822,6 +4824,15 @@ object CelebornConf extends Logging { .booleanConf .createWithDefault(false) + val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL: ConfigEntry[Long] = + buildConf("celeborn.client.spark.fetch.cleanFailedShuffleIntervalMs") + .categories("client") + .version("0.6.0") + .doc("the interval to clean the failed-to-fetch shuffle files, only valid when" + + s" ${CLIENT_FETCH_CLEAN_FAILED_SHUFFLE.key} is enabled") + .longConf + .createWithDefault(1000) + val CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED: ConfigEntry[Boolean] = buildConf("celeborn.client.fetch.excludeWorkerOnFailure.enabled") .categories("client") From 142966f2cde47eb5517a268a062a922a869ee3c0 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 21:25:34 -0700 Subject: [PATCH 042/120] addr comments 3 --- .../celeborn/spark/FailedShuffleCleaner.scala | 16 ++++++---------- .../celeborn/client/LifecycleManager.scala | 13 +++++++------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index a14c848c567..663e22e72b7 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -84,16 +84,12 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def addShuffleIdToBeCleaned( celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { - val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') - lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { - case (_, (celebornShuffleId, _)) => { - if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) - || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) - || noRunningDownstreamStage(celebornShuffleId) - || !committedSuccessfully(celebornShuffleId)) { - shufflesToBeCleand.put(celebornShuffleId) - } - } + val Array(_, stageId, _) = appShuffleIdentifier.split('-') + if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) + || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) + || noRunningDownstreamStage(celebornShuffleId) + || !committedSuccessfully(celebornShuffleId)) { + shufflesToBeCleand.put(celebornShuffleId) } } diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 932b45e9c44..3e31f238a3d 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -938,7 +938,7 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } val newShuffleId = shuffleIdGenerator.getAndIncrement() logInfo(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier") - getShuffleIdForWriterCallback.foreach(callback => + validateCelebornShuffleIdForClean.foreach(callback => callback.accept(newShuffleId, appShuffleIdentifier)) shuffleIds.put(appShuffleIdentifier, (newShuffleId, true)) newShuffleId @@ -954,7 +954,7 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends shuffleIds.values.filter(v => v._2).map(v => v._1).toSeq.reverse.find( areAllMapTasksEnd) match { case Some(celebornShuffleId) => - getShuffleIdForReaderCallback.foreach(callback => + recordShuffleIdReference.foreach(callback => callback.accept(celebornShuffleId, appShuffleIdentifier)) val pbGetShuffleIdResponse = { logDebug( @@ -1857,14 +1857,15 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } // expecting celeborn shuffle id and application shuffle identifier - @volatile private var getShuffleIdForWriterCallback: Option[BiConsumer[Integer, String]] = None + @volatile private var validateCelebornShuffleIdForClean: Option[BiConsumer[Integer, String]] = + None def registerGetShuffleIdForWriterCallback(callback: BiConsumer[Integer, String]): Unit = { - getShuffleIdForWriterCallback = Some(callback) + validateCelebornShuffleIdForClean = Some(callback) } // expecting celeborn shuffle id and application shuffle identifier - @volatile private var getShuffleIdForReaderCallback: Option[BiConsumer[Integer, String]] = None + @volatile private var recordShuffleIdReference: Option[BiConsumer[Integer, String]] = None def registerGetShuffleIdForReaderCallback(callback: BiConsumer[Integer, String]): Unit = { - getShuffleIdForReaderCallback = Some(callback) + recordShuffleIdReference = Some(callback) } @volatile private var unregisterShuffleCallback: Option[Consumer[Integer]] = None From ed8ebff7ac053fb562074c7bdac9a98f9ea16ae0 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 21:53:33 -0700 Subject: [PATCH 043/120] fix compilation --- .../celeborn/client/commit/ReducePartitionCommitHandler.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala b/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala index 810418931ca..22c8bf67b04 100644 --- a/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala +++ b/client/src/main/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandler.scala @@ -69,7 +69,7 @@ class ReducePartitionCommitHandler( class MultiSerdeVersionRpcContext(val ctx: RpcCallContext, val serdeVersion: SerdeVersion) {} private val getReducerFileGroupRequest = - JavaUtils.newConcurrentHashMap[Int, util.Set[RpcCallContext]]() + JavaUtils.newConcurrentHashMap[Int, util.Set[MultiSerdeVersionRpcContext]]() private[celeborn] val dataLostShuffleSet = ConcurrentHashMap.newKeySet[Int]() private val stageEndShuffleSet = ConcurrentHashMap.newKeySet[Int]() private val inProcessStageEndShuffleSet = ConcurrentHashMap.newKeySet[Int]() From 80c397f4d42a56e3317e47dbde34401d9551f354 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 21:59:55 -0700 Subject: [PATCH 044/120] use runnable to be compatible with spark 2 --- .../celeborn/spark/FailedShuffleCleaner.scala | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 663e22e72b7..a7268eee0b8 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -128,14 +128,16 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private val cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( "failedShuffleCleanerThreadPool") cleanerThreadPool.scheduleWithFixedDelay( - () => { - val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleand.drainTo(allShuffleIds) - allShuffleIds.asScala.foreach { shuffleId => - if (!cleanedShuffleIds.contains(shuffleId)) { - lifecycleManager.get().unregisterShuffle(shuffleId) - logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - cleanedShuffleIds += shuffleId + new Runnable { + override def run(): Unit = { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleand.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.get().unregisterShuffle(shuffleId) + logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId + } } } }, From 2476f3aaad1eb2a13a8a4888f81e5ee923692663 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 22:35:58 -0700 Subject: [PATCH 045/120] update param doc --- docs/configuration/client.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/configuration/client.md b/docs/configuration/client.md index 8e8f6c03a2c..612844989c5 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -112,6 +112,7 @@ license: | | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 | | | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 | | | celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.6.0 | | +| celeborn.client.spark.fetch.cleanFailedShuffleIntervalMs | 1000 | false | the interval to clean the failed-to-fetch shuffle files, only valid when celeborn.client.spark.fetch.cleanFailedShuffle is enabled | 0.6.0 | | | celeborn.client.spark.push.dynamicWriteMode.enabled | false | false | Whether to dynamically switch push write mode based on conditions.If true, shuffle mode will be only determined by partition count | 0.5.0 | | | celeborn.client.spark.push.dynamicWriteMode.partitionNum.threshold | 2000 | false | Threshold of shuffle partition number for dynamically switching push writer mode. When the shuffle partition number is greater than this value, use the sort-based shuffle writer for memory efficiency; otherwise use the hash-based shuffle writer for speed. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is true. | 0.5.0 | | | celeborn.client.spark.push.sort.memory.maxMemoryFactor | 0.4 | false | the max portion of executor memory which can be used for SortBasedWriter buffer (only valid when celeborn.client.spark.push.sort.memory.useAdaptiveThreshold is enabled | 0.5.0 | | From 9d705c36650ee5f10f860160a5823fdfb69f563f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 4 Apr 2025 22:41:39 -0700 Subject: [PATCH 046/120] fix NPE --- .../celeborn/spark/FailedShuffleCleaner.scala | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index a7268eee0b8..0453438126d 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -38,6 +38,11 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private[celeborn] val celebornShuffleIdToReferringStages = new ConcurrentHashMap[Int, mutable.HashSet[Int]]() + private lazy val cleanInterval = + lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS + + private var initialized = false + private val lock = new Object val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" @@ -103,7 +108,27 @@ private[celeborn] object FailedShuffleCleaner extends Logging { } def setLifecycleManager(ref: LifecycleManager): Unit = { - lifecycleManager.compareAndSet(null, ref) + val firstSet = lifecycleManager.compareAndSet(null, ref) + if (firstSet) { + cleanerThreadPool.scheduleWithFixedDelay( + new Runnable { + override def run(): Unit = { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleand.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.get().unregisterShuffle(shuffleId) + logInfo( + s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId + } + } + } + }, + cleanInterval, + cleanInterval, + TimeUnit.SECONDS) + } } def removeCleanedShuffleId(celebornShuffleId: Int): Unit = { @@ -127,21 +152,4 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private val cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( "failedShuffleCleanerThreadPool") - cleanerThreadPool.scheduleWithFixedDelay( - new Runnable { - override def run(): Unit = { - val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleand.drainTo(allShuffleIds) - allShuffleIds.asScala.foreach { shuffleId => - if (!cleanedShuffleIds.contains(shuffleId)) { - lifecycleManager.get().unregisterShuffle(shuffleId) - logInfo(s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - cleanedShuffleIds += shuffleId - } - } - } - }, - lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS, - lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS, - TimeUnit.SECONDS) } From 0f45cb8767f8474d69ab01dc7d7f0010581b6e2d Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 5 Apr 2025 17:01:43 -0700 Subject: [PATCH 047/120] fix tests --- .../celeborn/spark/FailedShuffleCleaner.scala | 29 ++++++++++--------- .../shuffle/celeborn/SparkShuffleManager.java | 20 ++++++++----- .../spark/shuffle/celeborn/SparkUtils.java | 4 +-- .../celeborn/client/LifecycleManager.scala | 9 +++--- 4 files changed, 35 insertions(+), 27 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 0453438126d..e8f674d8f95 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -41,8 +41,6 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private lazy val cleanInterval = lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS - private var initialized = false - private val lock = new Object val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" @@ -66,6 +64,9 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanedShuffleIds.clear() celebornShuffleIdToReferringStages.clear() runningStageManager = buildRunningStageChecker() + cleanerThreadPool.shutdownNow() + cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( + "failedShuffleCleanerThreadPool") } def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { @@ -86,15 +87,17 @@ private[celeborn] object FailedShuffleCleaner extends Logging { ret } - def addShuffleIdToBeCleaned( - celebornShuffleId: Int, - appShuffleIdentifier: String): Unit = { - val Array(_, stageId, _) = appShuffleIdentifier.split('-') - if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) - || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) - || noRunningDownstreamStage(celebornShuffleId) - || !committedSuccessfully(celebornShuffleId)) { - shufflesToBeCleand.put(celebornShuffleId) + def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { + val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') + lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { + case (_, (celebornShuffleId, _)) => { + if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) + || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) + || noRunningDownstreamStage(celebornShuffleId) + || !committedSuccessfully(celebornShuffleId)) { + shufflesToBeCleand.put(celebornShuffleId) + } + } } } @@ -127,7 +130,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { }, cleanInterval, cleanInterval, - TimeUnit.SECONDS) + TimeUnit.MILLISECONDS) } } @@ -150,6 +153,6 @@ private[celeborn] object FailedShuffleCleaner extends Logging { ret } - private val cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( + private var cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( "failedShuffleCleanerThreadPool") } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index a745c9a72cc..508fdabddad 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -157,11 +157,11 @@ private void initializeLifecycleManager(String appId) { } if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { - lifecycleManager.registerGetShuffleIdForWriterCallback( - (celebornShuffleId, appShuffleIdentifier) -> + lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( + (appShuffleIdentifier) -> SparkUtils.addWriterShuffleIdsToBeCleaned( - lifecycleManager, celebornShuffleId, appShuffleIdentifier)); - lifecycleManager.registerGetShuffleIdForReaderCallback( + lifecycleManager, appShuffleIdentifier)); + lifecycleManager.registerRecordShuffleIdReferenceCallback( (celebornShuffleId, appShuffleIdentifier) -> SparkUtils.addShuffleIdRefCount( lifecycleManager, celebornShuffleId, appShuffleIdentifier)); @@ -179,14 +179,18 @@ private void initializeLifecycleManager(String appId) { shuffleId -> SparkUtils.invalidateSerializedGetReducerFileGroupResponse(shuffleId)); } if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { - lifecycleManager.registerGetShuffleIdForWriterCallback( - (celebornShuffleId, appShuffleIdentifier) -> + System.out.println("register callbacks"); + lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( + (appShuffleIdentifier) -> SparkUtils.addWriterShuffleIdsToBeCleaned( - lifecycleManager, celebornShuffleId, appShuffleIdentifier)); - lifecycleManager.registerGetShuffleIdForReaderCallback( + lifecycleManager, appShuffleIdentifier)); + lifecycleManager.registerRecordShuffleIdReferenceCallback( (celebornShuffleId, appShuffleIdentifier) -> SparkUtils.addShuffleIdRefCount( lifecycleManager, celebornShuffleId, appShuffleIdentifier)); + lifecycleManager.registerUnregisterShuffleCallback( + (celebornShuffleId) -> + SparkUtils.removeCleanedShuffleId(lifecycleManager, celebornShuffleId)); } } } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 40862657641..18e7b3ec4b9 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -629,9 +629,9 @@ public static void invalidateSerializedGetReducerFileGroupResponse(Integer shuff } public static void addWriterShuffleIdsToBeCleaned( - LifecycleManager lifecycleManager, int celebornShuffeId, String appShuffleIdentifier) { + LifecycleManager lifecycleManager, String appShuffleIdentifier) { FailedShuffleCleaner.setLifecycleManager(lifecycleManager); - FailedShuffleCleaner.addShuffleIdToBeCleaned(celebornShuffeId, appShuffleIdentifier); + FailedShuffleCleaner.addShuffleIdToBeCleaned(appShuffleIdentifier); } public static void addShuffleIdRefCount( diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 3e31f238a3d..d9436152652 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -939,7 +939,7 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends val newShuffleId = shuffleIdGenerator.getAndIncrement() logInfo(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier") validateCelebornShuffleIdForClean.foreach(callback => - callback.accept(newShuffleId, appShuffleIdentifier)) + callback.accept(appShuffleIdentifier)) shuffleIds.put(appShuffleIdentifier, (newShuffleId, true)) newShuffleId } @@ -1857,14 +1857,15 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } // expecting celeborn shuffle id and application shuffle identifier - @volatile private var validateCelebornShuffleIdForClean: Option[BiConsumer[Integer, String]] = + @volatile private var validateCelebornShuffleIdForClean: Option[Consumer[String]] = None - def registerGetShuffleIdForWriterCallback(callback: BiConsumer[Integer, String]): Unit = { + def registerValidateCelebornShuffleIdForCleanCallback( + callback: Consumer[String]): Unit = { validateCelebornShuffleIdForClean = Some(callback) } // expecting celeborn shuffle id and application shuffle identifier @volatile private var recordShuffleIdReference: Option[BiConsumer[Integer, String]] = None - def registerGetShuffleIdForReaderCallback(callback: BiConsumer[Integer, String]): Unit = { + def registerRecordShuffleIdReferenceCallback(callback: BiConsumer[Integer, String]): Unit = { recordShuffleIdReference = Some(callback) } From d48e6a82907b0fa3e0414ade01becb0711d9e9db Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 5 Apr 2025 18:31:19 -0700 Subject: [PATCH 048/120] add debugging info2 --- .../scala/org/apache/celeborn/client/LifecycleManager.scala | 2 ++ .../spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala | 2 ++ .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 1 + 3 files changed, 5 insertions(+) diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index d9436152652..39eeed4835c 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -938,6 +938,8 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } val newShuffleId = shuffleIdGenerator.getAndIncrement() logInfo(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier") + println(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId" + + s" appShuffleIdentifier $appShuffleIdentifier") validateCelebornShuffleIdForClean.foreach(callback => callback.accept(appShuffleIdentifier)) shuffleIds.put(appShuffleIdentifier, (newShuffleId, true)) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala index 6c20366409f..e00077e91f1 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala @@ -48,7 +48,9 @@ class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanB .withColumnRenamed("count", "df2_count") val df = df1.hint("merge").join(df2, "countId").select("*").persist() val tuples = df.collect() + println("first job done") df.collect() + println("second job done") checkStorageValidation(checkingThread, timeout = 600 * 1000) // verify result assert(hook.executed.get()) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 823f9784ac5..441992c1a78 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -200,6 +200,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { sparkSession.stop() } } + test("celeborn spark integration test - clean up the shuffle files if" + " the referring stage has finished") { if (Spark3OrNewer) { From d5344da8507f8272b1ccc62bb988c998b7fc624e Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 5 Apr 2025 19:54:29 -0700 Subject: [PATCH 049/120] remove flaky test --- .../shuffle/celeborn/SparkShuffleManager.java | 1 - .../celeborn/client/LifecycleManager.scala | 2 - ...nFetchFailureDiskCleanExpensiveSuite.scala | 63 ------------------- .../fetch_failure/FetchFailureTestBase.scala | 9 ++- .../fetch_failure/ShuffleReaderGetHooks.scala | 4 +- 5 files changed, 10 insertions(+), 69 deletions(-) delete mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 508fdabddad..7bc733f0f16 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -179,7 +179,6 @@ private void initializeLifecycleManager(String appId) { shuffleId -> SparkUtils.invalidateSerializedGetReducerFileGroupResponse(shuffleId)); } if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { - System.out.println("register callbacks"); lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( (appShuffleIdentifier) -> SparkUtils.addWriterShuffleIdsToBeCleaned( diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index 39eeed4835c..d9436152652 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -938,8 +938,6 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends } val newShuffleId = shuffleIdGenerator.getAndIncrement() logInfo(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier") - println(s"generate new shuffleId $newShuffleId for appShuffleId $appShuffleId" + - s" appShuffleIdentifier $appShuffleIdentifier") validateCelebornShuffleIdForClean.foreach(callback => callback.accept(appShuffleIdentifier)) shuffleIds.put(appShuffleIdentifier, (newShuffleId, true)) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala deleted file mode 100644 index e00077e91f1..00000000000 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanExpensiveSuite.scala +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.celeborn.tests.spark - -import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} - -import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook} - -class CelebornFetchFailureDiskCleanExpensiveSuite extends FetchFailureDiskCleanBase { - - // 6. for multiple level M - 1 lineage , all failed disk spaces are cleaned - test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + - " correctly") { - if (Spark3OrNewer) { - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0, 1, 2, 3), - triggerStageId = Some(4)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = triggerStorageCheckThread( - Seq(0, 1, 2, 3), - Seq(4, 5, 6, 7), - sparkSession, - forStableStatusChecking = false) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df1_count") - val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df2_count") - val df = df1.hint("merge").join(df2, "countId").select("*").persist() - val tuples = df.collect() - println("first job done") - df.collect() - println("second job done") - checkStorageValidation(checkingThread, timeout = 600 * 1000) - // verify result - assert(hook.executed.get()) - val expect = "[1,2,2]" - assert(tuples.head.toString().equals(expect)) - sparkSession.stop() - } - } - -} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index 42398cce495..dd76ba66e65 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -28,8 +28,13 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { def createSparkSession( overrideShuffleMgr: Boolean = true, enableFailedShuffleCleaner: Boolean = false): SparkSession = { - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - + val sparkConf = new SparkConf().setAppName({ + if (!enableFailedShuffleCleaner) { + "fetch-failure" + } else { + "fetch-failure-failed-shuffle-clean" + } + }).setMaster("local[2,3]") var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index d60d6487f67..714ba838fa1 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -97,7 +97,9 @@ class FileDeletionShuffleReaderGetHook( startPartition: Int, endPartition: Int, context: TaskContext): Unit = { - if (executed.get()) return + if (executed.get()) { + return + } lock.synchronized { handle match { case h: CelebornShuffleHandle[_, _, _] => { From b76fbfb0e94bc58532f5acb656562b1b0350ff91 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 14 Apr 2025 19:38:32 -0400 Subject: [PATCH 050/120] addr comments --- .../shuffle/celeborn/SparkShuffleManager.java | 21 ++++++---------- .../spark/shuffle/celeborn/SparkUtils.java | 16 ++++++++----- .../celeborn/spark/FailedShuffleCleaner.scala | 24 ++++++++++++------- .../org/apache/spark/SparkContextHelper.scala | 0 .../spark/scheduler/RunningStageManager.scala | 0 .../CelebornFetchFailureDiskCleanSuite.scala | 3 +-- .../FetchFailureDiskCleanBase.scala | 2 +- .../fetch_failure/ShuffleReaderGetHooks.scala | 2 +- 8 files changed, 35 insertions(+), 33 deletions(-) rename client-spark/{common => spark-3}/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala (94%) rename client-spark/{common => spark-3}/src/main/scala/org/apache/spark/SparkContextHelper.scala (100%) rename client-spark/{common => spark-3}/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala (100%) diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 7bc733f0f16..72139423c55 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -157,13 +157,19 @@ private void initializeLifecycleManager(String appId) { } if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { + if (!lifecycleManager.conf().clientStageRerunEnabled()) { + throw new IllegalArgumentException( + CelebornConf.CLIENT_STAGE_RERUN_ENABLED().key() + " has to be " + + "enabled, when " + + CelebornConf.CLIENT_FETCH_CLEAN_FAILED_SHUFFLE().key() + " is set to true"); + } lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( (appShuffleIdentifier) -> SparkUtils.addWriterShuffleIdsToBeCleaned( lifecycleManager, appShuffleIdentifier)); lifecycleManager.registerRecordShuffleIdReferenceCallback( (celebornShuffleId, appShuffleIdentifier) -> - SparkUtils.addShuffleIdRefCount( + SparkUtils.addShuffleIdRefStage( lifecycleManager, celebornShuffleId, appShuffleIdentifier)); lifecycleManager.registerUnregisterShuffleCallback( (celebornShuffleId) -> @@ -178,19 +184,6 @@ private void initializeLifecycleManager(String appId) { lifecycleManager.registerInvalidatedBroadcastCallback( shuffleId -> SparkUtils.invalidateSerializedGetReducerFileGroupResponse(shuffleId)); } - if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { - lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( - (appShuffleIdentifier) -> - SparkUtils.addWriterShuffleIdsToBeCleaned( - lifecycleManager, appShuffleIdentifier)); - lifecycleManager.registerRecordShuffleIdReferenceCallback( - (celebornShuffleId, appShuffleIdentifier) -> - SparkUtils.addShuffleIdRefCount( - lifecycleManager, celebornShuffleId, appShuffleIdentifier)); - lifecycleManager.registerUnregisterShuffleCallback( - (celebornShuffleId) -> - SparkUtils.removeCleanedShuffleId(lifecycleManager, celebornShuffleId)); - } } } } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 18e7b3ec4b9..7e3c869312f 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -130,17 +130,21 @@ public static String appUniqueId(SparkContext context) { .getOrElse(context::applicationId); } - public static String getAppShuffleIdentifier(int appShuffleId, TaskContext context) { + public static String encodeAppShuffleIdentifier(int appShuffleId, TaskContext context) { return appShuffleId + "-" + context.stageId() + "-" + context.stageAttemptNumber(); } + public static String[] decodeAppShuffleIdentifier(String appShuffleIdentifier) { + return appShuffleIdentifier.split("-"); + } + public static int celebornShuffleId( ShuffleClient client, CelebornShuffleHandle handle, TaskContext context, Boolean isWriter) { if (handle.throwsFetchFailure()) { - String appShuffleIdentifier = getAppShuffleIdentifier(handle.shuffleId(), context); + String appShuffleIdentifier = encodeAppShuffleIdentifier(handle.shuffleId(), context); Tuple2 res = client.getShuffleId( handle.shuffleId(), @@ -329,7 +333,7 @@ public static void addFailureListenerIfBarrierTask( if (!(taskContext instanceof BarrierTaskContext)) return; int appShuffleId = handle.shuffleId(); - String appShuffleIdentifier = SparkUtils.getAppShuffleIdentifier(appShuffleId, taskContext); + String appShuffleIdentifier = SparkUtils.encodeAppShuffleIdentifier(appShuffleId, taskContext); BarrierTaskContext barrierContext = (BarrierTaskContext) taskContext; barrierContext.addTaskFailureListener( @@ -634,10 +638,10 @@ public static void addWriterShuffleIdsToBeCleaned( FailedShuffleCleaner.addShuffleIdToBeCleaned(appShuffleIdentifier); } - public static void addShuffleIdRefCount( - LifecycleManager lifecycleManager, int celebornShuffeId, String appShuffleIdentifier) { + public static void addShuffleIdRefStage( + LifecycleManager lifecycleManager, int celebornShuffleId, String appShuffleIdentifier) { FailedShuffleCleaner.setLifecycleManager(lifecycleManager); - FailedShuffleCleaner.addShuffleIdReferringStage(celebornShuffeId, appShuffleIdentifier); + FailedShuffleCleaner.addShuffleIdReferringStage(celebornShuffleId, appShuffleIdentifier); } public static void removeCleanedShuffleId( diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala similarity index 94% rename from client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala rename to client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index e8f674d8f95..c6139548f1f 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -15,19 +15,21 @@ * limitations under the License. */ package org.apache.celeborn.spark + import java.util -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} import java.util.concurrent.atomic.AtomicReference +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} - import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils +import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} +import org.apache.spark.shuffle.celeborn.SparkUtils + private[celeborn] object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) @@ -38,10 +40,11 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private[celeborn] val celebornShuffleIdToReferringStages = new ConcurrentHashMap[Int, mutable.HashSet[Int]]() + private val lock = new Object + private lazy val cleanInterval = lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS - private val lock = new Object val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" private[celeborn] var runningStageManager: RunningStageManager = buildRunningStageChecker() @@ -71,14 +74,16 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { // this is only implemented/tested with Spark for now - val Array(_, stageId, _) = appShuffleIdentifier.split('-') - celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, new mutable.HashSet[Int]()) + val Array(_, stageId, _) = SparkUtils.decodeAppShuffleIdentifier(appShuffleIdentifier) + celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, + new mutable.HashSet[Int]) lock.synchronized { celebornShuffleIdToReferringStages.get(celebornShuffleId).add(stageId.toInt) } } - private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = { + private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = + lock.synchronized { val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) if (ret) { @@ -88,7 +93,8 @@ private[celeborn] object FailedShuffleCleaner extends Logging { } def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { - val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split('-') + val Array(appShuffleId, stageId, _) = SparkUtils.decodeAppShuffleIdentifier( + appShuffleIdentifier) lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { case (_, (celebornShuffleId, _)) => { if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) @@ -138,7 +144,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanedShuffleIds.remove(celebornShuffleId) } - private def noRunningDownstreamStage(celebornShuffleId: Int): Boolean = { + private def noRunningDownstreamStage(celebornShuffleId: Int): Boolean = lock.synchronized { val allReferringStageIds = celebornShuffleIdToReferringStages.get(celebornShuffleId) require(allReferringStageIds != null, s"no stage referring to shuffle $celebornShuffleId") val ret = diff --git a/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala similarity index 100% rename from client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala rename to client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala diff --git a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala similarity index 100% rename from client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala rename to client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 441992c1a78..4755d50ac11 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -19,9 +19,8 @@ package org.apache.celeborn.tests.spark import scala.collection.mutable import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} - -import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} +import org.apache.spark.celeborn.spark.FailedShuffleCleaner class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index 846216087bd..04ac48874b3 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -25,7 +25,7 @@ import org.scalatest.funsuite.AnyFunSuite import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.service.deploy.worker.Worker -import org.apache.celeborn.spark.FailedShuffleCleaner +import org.apache.spark.celeborn.spark.FailedShuffleCleaner private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite with FetchFailureTestBase diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index 714ba838fa1..35e3d2561b9 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -112,7 +112,7 @@ class FileDeletionShuffleReaderGetHook( h.userIdentifier, h.extension) val celebornShuffleId = SparkUtils.celebornShuffleId(shuffleClient, h, context, false) - val appShuffleIdentifier = SparkUtils.getAppShuffleIdentifier(handle.shuffleId, context) + val appShuffleIdentifier = SparkUtils.encodeAppShuffleIdentifier(handle.shuffleId, context) val Array(_, stageId, _) = appShuffleIdentifier.split('-') if (triggerStageId.isEmpty || triggerStageId.get == stageId.toInt) { if (shuffleIdToBeDeleted.isEmpty) { From 61bae501210cd24a6b1cac754a4f99597ec3703e Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 14 Apr 2025 19:45:24 -0400 Subject: [PATCH 051/120] fix compile --- .../shuffle/celeborn/SparkShuffleManager.java | 8 ++++--- .../celeborn/spark/FailedShuffleCleaner.scala | 23 +++++++++---------- .../CelebornFetchFailureDiskCleanSuite.scala | 3 ++- .../FetchFailureDiskCleanBase.scala | 2 +- .../fetch_failure/ShuffleReaderGetHooks.scala | 3 ++- 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 72139423c55..45e74b2efa4 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -159,9 +159,11 @@ private void initializeLifecycleManager(String appId) { if (lifecycleManager.conf().clientFetchCleanFailedShuffle()) { if (!lifecycleManager.conf().clientStageRerunEnabled()) { throw new IllegalArgumentException( - CelebornConf.CLIENT_STAGE_RERUN_ENABLED().key() + " has to be " - + "enabled, when " + - CelebornConf.CLIENT_FETCH_CLEAN_FAILED_SHUFFLE().key() + " is set to true"); + CelebornConf.CLIENT_STAGE_RERUN_ENABLED().key() + + " has to be " + + "enabled, when " + + CelebornConf.CLIENT_FETCH_CLEAN_FAILED_SHUFFLE().key() + + " is set to true"); } lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( (appShuffleIdentifier) -> diff --git a/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index c6139548f1f..915d8374f48 100644 --- a/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -17,19 +17,19 @@ package org.apache.celeborn.spark import java.util -import java.util.concurrent.atomic.AtomicReference import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} +import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.mutable +import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} +import org.apache.spark.shuffle.celeborn.SparkUtils + import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils -import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} -import org.apache.spark.shuffle.celeborn.SparkUtils - private[celeborn] object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) @@ -75,8 +75,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { // this is only implemented/tested with Spark for now val Array(_, stageId, _) = SparkUtils.decodeAppShuffleIdentifier(appShuffleIdentifier) - celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, - new mutable.HashSet[Int]) + celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, new mutable.HashSet[Int]) lock.synchronized { celebornShuffleIdToReferringStages.get(celebornShuffleId).add(stageId.toInt) } @@ -84,13 +83,13 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = lock.synchronized { - val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && - celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) - if (ret) { - logInfo(s"only stage $stageId refers to shuffle $celebornShuffleId, adding for clean up") + val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && + celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) + if (ret) { + logInfo(s"only stage $stageId refers to shuffle $celebornShuffleId, adding for clean up") + } + ret } - ret - } def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { val Array(appShuffleId, stageId, _) = SparkUtils.decodeAppShuffleIdentifier( diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 4755d50ac11..441992c1a78 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -19,8 +19,9 @@ package org.apache.celeborn.tests.spark import scala.collection.mutable import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} + +import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} -import org.apache.spark.celeborn.spark.FailedShuffleCleaner class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index 04ac48874b3..846216087bd 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -25,7 +25,7 @@ import org.scalatest.funsuite.AnyFunSuite import org.apache.celeborn.client.ShuffleClient import org.apache.celeborn.service.deploy.worker.Worker -import org.apache.spark.celeborn.spark.FailedShuffleCleaner +import org.apache.celeborn.spark.FailedShuffleCleaner private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite with FetchFailureTestBase diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index 35e3d2561b9..a196e740c2b 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -112,7 +112,8 @@ class FileDeletionShuffleReaderGetHook( h.userIdentifier, h.extension) val celebornShuffleId = SparkUtils.celebornShuffleId(shuffleClient, h, context, false) - val appShuffleIdentifier = SparkUtils.encodeAppShuffleIdentifier(handle.shuffleId, context) + val appShuffleIdentifier = + SparkUtils.encodeAppShuffleIdentifier(handle.shuffleId, context) val Array(_, stageId, _) = appShuffleIdentifier.split('-') if (triggerStageId.isEmpty || triggerStageId.get == stageId.toInt) { if (shuffleIdToBeDeleted.isEmpty) { From 8a5692bb5683df5b833ce3f7c9d10021a69a9df5 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 14 Apr 2025 20:31:00 -0400 Subject: [PATCH 052/120] fix spark 2 compile --- .../org/apache/celeborn/spark/FailedShuffleCleaner.scala | 5 +---- .../src/main/java}/org/apache/spark/SparkContextHelper.scala | 0 .../org/apache/spark/scheduler/RunningStageManager.scala | 0 3 files changed, 1 insertion(+), 4 deletions(-) rename client-spark/{spark-3/src/main/scala => common/src/main/java}/org/apache/celeborn/spark/FailedShuffleCleaner.scala (97%) rename client-spark/{spark-3/src/main/scala => common/src/main/java}/org/apache/spark/SparkContextHelper.scala (100%) rename client-spark/{spark-3/src/main/scala => common/src/main/java}/org/apache/spark/scheduler/RunningStageManager.scala (100%) diff --git a/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/java/org/apache/celeborn/spark/FailedShuffleCleaner.scala similarity index 97% rename from client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala rename to client-spark/common/src/main/java/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 915d8374f48..57b0b1dd305 100644 --- a/client-spark/spark-3/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/java/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -17,15 +17,12 @@ package org.apache.celeborn.spark import java.util -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} import java.util.concurrent.atomic.AtomicReference +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} -import org.apache.spark.shuffle.celeborn.SparkUtils - import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala b/client-spark/common/src/main/java/org/apache/spark/SparkContextHelper.scala similarity index 100% rename from client-spark/spark-3/src/main/scala/org/apache/spark/SparkContextHelper.scala rename to client-spark/common/src/main/java/org/apache/spark/SparkContextHelper.scala diff --git a/client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/common/src/main/java/org/apache/spark/scheduler/RunningStageManager.scala similarity index 100% rename from client-spark/spark-3/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala rename to client-spark/common/src/main/java/org/apache/spark/scheduler/RunningStageManager.scala From bbe9638b91272d2c8c30de2b76d017a1d43381c3 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 14 Apr 2025 20:45:09 -0400 Subject: [PATCH 053/120] fix build --- .../org/apache/celeborn/spark/FailedShuffleCleaner.scala | 9 +++++---- .../org/apache/spark/SparkContextHelper.scala | 0 .../org/apache/spark/scheduler/RunningStageManager.scala | 0 .../org/apache/spark/shuffle/celeborn/SparkUtils.java | 6 +----- .../spark/fetch_failure/ShuffleReaderGetHooks.scala | 2 +- 5 files changed, 7 insertions(+), 10 deletions(-) rename client-spark/common/src/main/{java => scala}/org/apache/celeborn/spark/FailedShuffleCleaner.scala (96%) rename client-spark/common/src/main/{java => scala}/org/apache/spark/SparkContextHelper.scala (100%) rename client-spark/common/src/main/{java => scala}/org/apache/spark/scheduler/RunningStageManager.scala (100%) diff --git a/client-spark/common/src/main/java/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala similarity index 96% rename from client-spark/common/src/main/java/org/apache/celeborn/spark/FailedShuffleCleaner.scala rename to client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 57b0b1dd305..6e961389bdf 100644 --- a/client-spark/common/src/main/java/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -17,12 +17,14 @@ package org.apache.celeborn.spark import java.util -import java.util.concurrent.atomic.AtomicReference import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} +import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.mutable +import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} + import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils @@ -71,7 +73,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { // this is only implemented/tested with Spark for now - val Array(_, stageId, _) = SparkUtils.decodeAppShuffleIdentifier(appShuffleIdentifier) + val Array(_, stageId, _) = appShuffleIdentifier.split("-"); celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, new mutable.HashSet[Int]) lock.synchronized { celebornShuffleIdToReferringStages.get(celebornShuffleId).add(stageId.toInt) @@ -89,8 +91,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { } def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { - val Array(appShuffleId, stageId, _) = SparkUtils.decodeAppShuffleIdentifier( - appShuffleIdentifier) + val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split("-"); lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { case (_, (celebornShuffleId, _)) => { if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) diff --git a/client-spark/common/src/main/java/org/apache/spark/SparkContextHelper.scala b/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala similarity index 100% rename from client-spark/common/src/main/java/org/apache/spark/SparkContextHelper.scala rename to client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala diff --git a/client-spark/common/src/main/java/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala similarity index 100% rename from client-spark/common/src/main/java/org/apache/spark/scheduler/RunningStageManager.scala rename to client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 7e3c869312f..f822c4b9262 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -134,10 +134,6 @@ public static String encodeAppShuffleIdentifier(int appShuffleId, TaskContext co return appShuffleId + "-" + context.stageId() + "-" + context.stageAttemptNumber(); } - public static String[] decodeAppShuffleIdentifier(String appShuffleIdentifier) { - return appShuffleIdentifier.split("-"); - } - public static int celebornShuffleId( ShuffleClient client, CelebornShuffleHandle handle, @@ -333,7 +329,7 @@ public static void addFailureListenerIfBarrierTask( if (!(taskContext instanceof BarrierTaskContext)) return; int appShuffleId = handle.shuffleId(); - String appShuffleIdentifier = SparkUtils.encodeAppShuffleIdentifier(appShuffleId, taskContext); + String appShuffleIdentifier = encodeAppShuffleIdentifier(appShuffleId, taskContext); BarrierTaskContext barrierContext = (BarrierTaskContext) taskContext; barrierContext.addTaskFailureListener( diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index a196e740c2b..9db9f4c94b9 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -113,7 +113,7 @@ class FileDeletionShuffleReaderGetHook( h.extension) val celebornShuffleId = SparkUtils.celebornShuffleId(shuffleClient, h, context, false) val appShuffleIdentifier = - SparkUtils.encodeAppShuffleIdentifier(handle.shuffleId, context) + SparkUtils.getAppShuffleIdentifier(handle.shuffleId, context) val Array(_, stageId, _) = appShuffleIdentifier.split('-') if (triggerStageId.isEmpty || triggerStageId.get == stageId.toInt) { if (shuffleIdToBeDeleted.isEmpty) { From e35f9961d48d247f476ea469658f438e30cf76c6 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 21 Apr 2025 10:01:47 -0700 Subject: [PATCH 054/120] refactor encode/decode app identifier and remove runningstagemanagers --- .../shuffle/celeborn/SparkCommonUtils.java | 8 ++++++ .../celeborn/spark/FailedShuffleCleaner.scala | 11 ++++---- .../{scheduler => }/RunningStageManager.scala | 25 ++++++++++++++++--- .../shuffle/celeborn/SparkShuffleManager.java | 4 +++ .../spark/shuffle/celeborn/SparkUtils.java | 10 +++----- .../fetch_failure/ShuffleReaderGetHooks.scala | 5 ++-- .../TestRunningStageManager.scala | 9 +++---- 7 files changed, 49 insertions(+), 23 deletions(-) rename client-spark/common/src/main/scala/org/apache/spark/{scheduler => }/RunningStageManager.scala (58%) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/SparkCommonUtils.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/SparkCommonUtils.java index a24e06d5a68..84d74f8c145 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/SparkCommonUtils.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/SparkCommonUtils.java @@ -52,6 +52,14 @@ public static void validateAttemptConfig(SparkConf conf) throws IllegalArgumentE } } + public static String encodeAppShuffleIdentifier(int appShuffleId, TaskContext context) { + return appShuffleId + "-" + context.stageId() + "-" + context.stageAttemptNumber(); + } + + public static String[] decodeAppShuffleIdentifier(String appShuffleIdentifier) { + return appShuffleIdentifier.split("-"); + } + public static int getEncodedAttemptNumber(TaskContext context) { return (context.stageAttemptNumber() << 16) | context.attemptNumber(); } diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 6e961389bdf..a55d222729b 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -23,12 +23,13 @@ import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.scheduler.{RunningStageManager, RunningStageManagerImpl} - +import org.apache.spark.{RunningStageManager, RunningStageManagerImpl} import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils +import org.apache.spark.shuffle.celeborn.SparkCommonUtils + private[celeborn] object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) @@ -72,8 +73,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { } def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { - // this is only implemented/tested with Spark for now - val Array(_, stageId, _) = appShuffleIdentifier.split("-"); + val Array(_, stageId, _) = SparkCommonUtils.decodeAppShuffleIdentifier(appShuffleIdentifier) celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, new mutable.HashSet[Int]) lock.synchronized { celebornShuffleIdToReferringStages.get(celebornShuffleId).add(stageId.toInt) @@ -91,7 +91,8 @@ private[celeborn] object FailedShuffleCleaner extends Logging { } def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { - val Array(appShuffleId, stageId, _) = appShuffleIdentifier.split("-"); + val Array(appShuffleId, stageId, _) = SparkCommonUtils.decodeAppShuffleIdentifier( + appShuffleIdentifier) lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { case (_, (celebornShuffleId, _)) => { if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) diff --git a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala similarity index 58% rename from client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala rename to client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala index daa9688c4e9..6ac5167d3c0 100644 --- a/client-spark/common/src/main/scala/org/apache/spark/scheduler/RunningStageManager.scala +++ b/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala @@ -14,17 +14,34 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.scheduler +package org.apache.spark -import org.apache.spark.SparkContext +import scala.collection.mutable trait RunningStageManager { def isRunningStage(stageId: Int): Boolean } class RunningStageManagerImpl extends RunningStageManager { - private def dagScheduler = SparkContext.getActive.get.dagScheduler + + private val stageClass = Class.forName("org.apache.spark.scheduler.Stage") + + private val idField = stageClass.getDeclaredField("id") + idField.setAccessible(true) + + + private def runningStages: mutable.HashSet[_] = { + val dagSchedulerClz = SparkContext.getActive.get.dagScheduler.getClass + val runningStagesField = dagSchedulerClz.getDeclaredField("runningStages") + runningStagesField.setAccessible(true) + runningStagesField.get(SparkContext.getActive.get.dagScheduler) + .asInstanceOf[mutable.HashSet[_]] + } + override def isRunningStage(stageId: Int): Boolean = { - dagScheduler.runningStages.map(_.id).contains(stageId) + runningStages.map { stage => + val stageId = idField.get(stage).asInstanceOf[Int] + stageId + }.contains(stageId) } } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 45e74b2efa4..821b8d13a43 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -21,6 +21,7 @@ import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; +import org.apache.celeborn.spark.FailedShuffleCleaner; import org.apache.spark.*; import org.apache.spark.internal.config.package$; import org.apache.spark.launcher.SparkLauncher; @@ -268,6 +269,9 @@ public void stop() { _sortShuffleManager.stop(); _sortShuffleManager = null; } + if (celebornConf.clientFetchCleanFailedShuffle()) { + FailedShuffleCleaner.reset(); + } } @Override diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index f822c4b9262..b485493a952 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -130,17 +130,14 @@ public static String appUniqueId(SparkContext context) { .getOrElse(context::applicationId); } - public static String encodeAppShuffleIdentifier(int appShuffleId, TaskContext context) { - return appShuffleId + "-" + context.stageId() + "-" + context.stageAttemptNumber(); - } - public static int celebornShuffleId( ShuffleClient client, CelebornShuffleHandle handle, TaskContext context, Boolean isWriter) { if (handle.throwsFetchFailure()) { - String appShuffleIdentifier = encodeAppShuffleIdentifier(handle.shuffleId(), context); + String appShuffleIdentifier = SparkCommonUtils.encodeAppShuffleIdentifier(handle.shuffleId(), + context); Tuple2 res = client.getShuffleId( handle.shuffleId(), @@ -329,7 +326,8 @@ public static void addFailureListenerIfBarrierTask( if (!(taskContext instanceof BarrierTaskContext)) return; int appShuffleId = handle.shuffleId(); - String appShuffleIdentifier = encodeAppShuffleIdentifier(appShuffleId, taskContext); + String appShuffleIdentifier = SparkCommonUtils.encodeAppShuffleIdentifier( + appShuffleId, taskContext); BarrierTaskContext barrierContext = (BarrierTaskContext) taskContext; barrierContext.addTaskFailureListener( diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index 9db9f4c94b9..fb5f43f0e82 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -22,8 +22,7 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.shuffle.ShuffleHandle -import org.apache.spark.shuffle.celeborn.{CelebornShuffleHandle, ShuffleManagerHook, SparkShuffleManager, SparkUtils, TestCelebornShuffleManager} - +import org.apache.spark.shuffle.celeborn.{CelebornShuffleHandle, ShuffleManagerHook, SparkCommonUtils, SparkShuffleManager, SparkUtils, TestCelebornShuffleManager} import org.apache.celeborn.client.{LifecycleManager, ShuffleClient} import org.apache.celeborn.client.commit.ReducePartitionCommitHandler import org.apache.celeborn.common.CelebornConf @@ -113,7 +112,7 @@ class FileDeletionShuffleReaderGetHook( h.extension) val celebornShuffleId = SparkUtils.celebornShuffleId(shuffleClient, h, context, false) val appShuffleIdentifier = - SparkUtils.getAppShuffleIdentifier(handle.shuffleId, context) + SparkCommonUtils.encodeAppShuffleIdentifier(handle.shuffleId, context) val Array(_, stageId, _) = appShuffleIdentifier.split('-') if (triggerStageId.isEmpty || triggerStageId.get == stageId.toInt) { if (shuffleIdToBeDeleted.isEmpty) { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala index 6a75398a24b..3a6a0a8e295 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala @@ -18,22 +18,21 @@ package org.apache.celeborn.tests.spark.fetch_failure import scala.collection.mutable -import org.apache.spark.scheduler.{RunningStageManager, SparkSchedulerHelper} +import org.apache.spark.RunningStageManager +import org.apache.spark.scheduler.SparkSchedulerHelper class TestRunningStageManager extends RunningStageManager { import TestRunningStageManager._ - def setRunningStages(stageIds: Seq[Int]): Unit = { - stageIds.foreach(stageId => runningStages += stageId) - } + override def isRunningStage(stageId: Int): Boolean = { if (runningStages.contains(stageId)) { - println(s"instrumented running stages contains $stageId") true } else { SparkSchedulerHelper.runningStages.map(_.id).contains(stageId) } } } + object TestRunningStageManager { val runningStages = new mutable.HashSet[Int] } From c121ece4ce52dc6e6ef0311e4abd943460ec8b22 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 21 Apr 2025 10:10:23 -0700 Subject: [PATCH 055/120] stylistic fixes --- .../org/apache/celeborn/spark/FailedShuffleCleaner.scala | 4 ++-- .../src/main/scala/org/apache/spark/RunningStageManager.scala | 1 - .../apache/spark/shuffle/celeborn/SparkShuffleManager.java | 2 +- .../java/org/apache/spark/shuffle/celeborn/SparkUtils.java | 4 ++-- .../tests/spark/fetch_failure/ShuffleReaderGetHooks.scala | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index a55d222729b..28d804c9b9b 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -24,12 +24,12 @@ import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.{RunningStageManager, RunningStageManagerImpl} +import org.apache.spark.shuffle.celeborn.SparkCommonUtils + import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils -import org.apache.spark.shuffle.celeborn.SparkCommonUtils - private[celeborn] object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) diff --git a/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala index 6ac5167d3c0..9d57adfdd5f 100644 --- a/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala +++ b/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala @@ -29,7 +29,6 @@ class RunningStageManagerImpl extends RunningStageManager { private val idField = stageClass.getDeclaredField("id") idField.setAccessible(true) - private def runningStages: mutable.HashSet[_] = { val dagSchedulerClz = SparkContext.getActive.get.dagScheduler.getClass val runningStagesField = dagSchedulerClz.getDeclaredField("runningStages") diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 821b8d13a43..5ec740204c2 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -21,7 +21,6 @@ import java.util.Objects; import java.util.concurrent.ConcurrentHashMap; -import org.apache.celeborn.spark.FailedShuffleCleaner; import org.apache.spark.*; import org.apache.spark.internal.config.package$; import org.apache.spark.launcher.SparkLauncher; @@ -38,6 +37,7 @@ import org.apache.celeborn.common.CelebornConf; import org.apache.celeborn.common.protocol.ShuffleMode; import org.apache.celeborn.reflect.DynMethods; +import org.apache.celeborn.spark.FailedShuffleCleaner; /** * In order to support Spark Stage resubmit with ShuffleReader FetchFails, Celeborn shuffleId has to diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index b485493a952..42a0c8207aa 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -326,8 +326,8 @@ public static void addFailureListenerIfBarrierTask( if (!(taskContext instanceof BarrierTaskContext)) return; int appShuffleId = handle.shuffleId(); - String appShuffleIdentifier = SparkCommonUtils.encodeAppShuffleIdentifier( - appShuffleId, taskContext); + String appShuffleIdentifier = + SparkCommonUtils.encodeAppShuffleIdentifier(appShuffleId, taskContext); BarrierTaskContext barrierContext = (BarrierTaskContext) taskContext; barrierContext.addTaskFailureListener( diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala index fb5f43f0e82..83b8df4625f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/ShuffleReaderGetHooks.scala @@ -23,6 +23,7 @@ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.shuffle.ShuffleHandle import org.apache.spark.shuffle.celeborn.{CelebornShuffleHandle, ShuffleManagerHook, SparkCommonUtils, SparkShuffleManager, SparkUtils, TestCelebornShuffleManager} + import org.apache.celeborn.client.{LifecycleManager, ShuffleClient} import org.apache.celeborn.client.commit.ReducePartitionCommitHandler import org.apache.celeborn.common.CelebornConf From 33b4145cfa4618d0242e338a24e6395a4534f1af Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 25 Apr 2025 09:48:20 -0700 Subject: [PATCH 056/120] addr comments --- .../celeborn/RunningStageManagerImpl.java | 51 +++++++++++++++++++ .../celeborn/spark/FailedShuffleCleaner.scala | 3 +- .../spark/RunningStageManager.scala | 27 +--------- .../TestRunningStageManager.scala | 3 +- .../org/apache/spark/SparkContextHelper.scala | 0 5 files changed, 55 insertions(+), 29 deletions(-) create mode 100644 client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java rename client-spark/common/src/main/scala/org/apache/{ => celeborn}/spark/RunningStageManager.scala (51%) rename {client-spark/common/src/main => tests/spark-it/src/test}/scala/org/apache/spark/SparkContextHelper.scala (100%) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java new file mode 100644 index 00000000000..1475ec12a0f --- /dev/null +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -0,0 +1,51 @@ +package org.apache.spark.shuffle.celeborn; + +import java.lang.reflect.Field; +import java.util.HashSet; + +import org.apache.spark.SparkContext$; +import org.apache.spark.scheduler.DAGScheduler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.celeborn.spark.RunningStageManager; + +public class RunningStageManagerImpl implements RunningStageManager { + + private static final Logger LOG = LoggerFactory.getLogger(RunningStageManagerImpl.class); + private final Field idField; + + public RunningStageManagerImpl() + throws ClassNotFoundException, NoSuchFieldException, IllegalAccessException { + Class stageClass = Class.forName("org.apache.spark.scheduler.Stage"); + idField = stageClass.getDeclaredField("id"); + idField.setAccessible(true); + } + + private HashSet runningStages() { + try { + DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); + Class dagSchedulerClz = SparkContext$.MODULE$.getActive().get().dagScheduler().getClass(); + Field runningStagesField = dagSchedulerClz.getDeclaredField("runningStages"); + return (HashSet) runningStagesField.get(dagScheduler); + } catch (Exception e) { + LOG.error("cannot get running stages", e); + return new HashSet<>(); + } + } + + public boolean isRunningStage(int stageId) { + try { + for (Object stage : runningStages()) { + int currentStageId = (Integer) idField.get(stage); + if (currentStageId == stageId) { + return true; + } + } + return false; + } catch (Exception e) { + LOG.error("unexpected exception when checking whether it is running stage ", e); + return true; + } + } +} diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 28d804c9b9b..4694926a093 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -23,8 +23,7 @@ import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.{RunningStageManager, RunningStageManagerImpl} -import org.apache.spark.shuffle.celeborn.SparkCommonUtils +import org.apache.spark.shuffle.celeborn.{RunningStageManagerImpl, SparkCommonUtils} import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging diff --git a/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala similarity index 51% rename from client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala rename to client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala index 9d57adfdd5f..b70851583c5 100644 --- a/client-spark/common/src/main/scala/org/apache/spark/RunningStageManager.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala @@ -14,33 +14,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark - -import scala.collection.mutable +package org.apache.celeborn.spark trait RunningStageManager { def isRunningStage(stageId: Int): Boolean } - -class RunningStageManagerImpl extends RunningStageManager { - - private val stageClass = Class.forName("org.apache.spark.scheduler.Stage") - - private val idField = stageClass.getDeclaredField("id") - idField.setAccessible(true) - - private def runningStages: mutable.HashSet[_] = { - val dagSchedulerClz = SparkContext.getActive.get.dagScheduler.getClass - val runningStagesField = dagSchedulerClz.getDeclaredField("runningStages") - runningStagesField.setAccessible(true) - runningStagesField.get(SparkContext.getActive.get.dagScheduler) - .asInstanceOf[mutable.HashSet[_]] - } - - override def isRunningStage(stageId: Int): Boolean = { - runningStages.map { stage => - val stageId = idField.get(stage).asInstanceOf[Int] - stageId - }.contains(stageId) - } -} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala index 3a6a0a8e295..ec0aaf56139 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala @@ -18,9 +18,10 @@ package org.apache.celeborn.tests.spark.fetch_failure import scala.collection.mutable -import org.apache.spark.RunningStageManager import org.apache.spark.scheduler.SparkSchedulerHelper +import org.apache.celeborn.spark.RunningStageManager + class TestRunningStageManager extends RunningStageManager { import TestRunningStageManager._ diff --git a/client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala b/tests/spark-it/src/test/scala/org/apache/spark/SparkContextHelper.scala similarity index 100% rename from client-spark/common/src/main/scala/org/apache/spark/SparkContextHelper.scala rename to tests/spark-it/src/test/scala/org/apache/spark/SparkContextHelper.scala From c6e2f812caafba702911cc34d9127862dd1ac381 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 25 Apr 2025 09:50:12 -0700 Subject: [PATCH 057/120] license --- .../celeborn/RunningStageManagerImpl.java | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index 1475ec12a0f..236823e49c9 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.spark.shuffle.celeborn; import java.lang.reflect.Field; From 5adfe0b726ecae05e05c9cc5d19c73328334180c Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 27 Apr 2025 20:46:38 -0700 Subject: [PATCH 058/120] addr comments --- .../scala/org/apache/celeborn/common/CelebornConf.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index ec848dda517..86515916794 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -4825,13 +4825,13 @@ object CelebornConf extends Logging { .createWithDefault(false) val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL: ConfigEntry[Long] = - buildConf("celeborn.client.spark.fetch.cleanFailedShuffleIntervalMs") + buildConf("celeborn.client.spark.fetch.cleanFailedShuffleInterval") .categories("client") .version("0.6.0") .doc("the interval to clean the failed-to-fetch shuffle files, only valid when" + s" ${CLIENT_FETCH_CLEAN_FAILED_SHUFFLE.key} is enabled") - .longConf - .createWithDefault(1000) + .timeConf(TimeUnit.MILLISECONDS) + .createWithDefaultString("1s") val CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED: ConfigEntry[Boolean] = buildConf("celeborn.client.fetch.excludeWorkerOnFailure.enabled") From a948f92cda1bcaaa58206756dfbfbda3cec0265f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sun, 27 Apr 2025 21:02:56 -0700 Subject: [PATCH 059/120] update param doc --- docs/configuration/client.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/configuration/client.md b/docs/configuration/client.md index 612844989c5..e4e8e0e83ee 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -112,7 +112,7 @@ license: | | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 | | | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 | | | celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.6.0 | | -| celeborn.client.spark.fetch.cleanFailedShuffleIntervalMs | 1000 | false | the interval to clean the failed-to-fetch shuffle files, only valid when celeborn.client.spark.fetch.cleanFailedShuffle is enabled | 0.6.0 | | +| celeborn.client.spark.fetch.cleanFailedShuffleInterval | 1s | false | the interval to clean the failed-to-fetch shuffle files, only valid when celeborn.client.spark.fetch.cleanFailedShuffle is enabled | 0.6.0 | | | celeborn.client.spark.push.dynamicWriteMode.enabled | false | false | Whether to dynamically switch push write mode based on conditions.If true, shuffle mode will be only determined by partition count | 0.5.0 | | | celeborn.client.spark.push.dynamicWriteMode.partitionNum.threshold | 2000 | false | Threshold of shuffle partition number for dynamically switching push writer mode. When the shuffle partition number is greater than this value, use the sort-based shuffle writer for memory efficiency; otherwise use the hash-based shuffle writer for speed. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is true. | 0.5.0 | | | celeborn.client.spark.push.sort.memory.maxMemoryFactor | 0.4 | false | the max portion of executor memory which can be used for SortBasedWriter buffer (only valid when celeborn.client.spark.push.sort.memory.useAdaptiveThreshold is enabled | 0.5.0 | | From fc5142a34521058fa82c4b11e1255a47b23b3cd9 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Mon, 28 Apr 2025 22:09:16 -0700 Subject: [PATCH 060/120] addr comments --- .../celeborn/RunningStageManagerImpl.java | 21 +++++------ .../celeborn/spark/FailedShuffleCleaner.scala | 35 +++++++++++-------- .../shuffle/celeborn/SparkShuffleManager.java | 1 + 3 files changed, 30 insertions(+), 27 deletions(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index 236823e49c9..cbcd3c91fba 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -17,34 +17,29 @@ package org.apache.spark.shuffle.celeborn; -import java.lang.reflect.Field; import java.util.HashSet; import org.apache.spark.SparkContext$; import org.apache.spark.scheduler.DAGScheduler; +import org.apache.spark.scheduler.Stage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.celeborn.reflect.DynFields; import org.apache.celeborn.spark.RunningStageManager; public class RunningStageManagerImpl implements RunningStageManager { private static final Logger LOG = LoggerFactory.getLogger(RunningStageManagerImpl.class); - private final Field idField; - - public RunningStageManagerImpl() - throws ClassNotFoundException, NoSuchFieldException, IllegalAccessException { - Class stageClass = Class.forName("org.apache.spark.scheduler.Stage"); - idField = stageClass.getDeclaredField("id"); - idField.setAccessible(true); - } + private static final DynFields.UnboundField idField = + DynFields.builder().hiddenImpl(Stage.class, "id").build(); private HashSet runningStages() { try { DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); - Class dagSchedulerClz = SparkContext$.MODULE$.getActive().get().dagScheduler().getClass(); - Field runningStagesField = dagSchedulerClz.getDeclaredField("runningStages"); - return (HashSet) runningStagesField.get(dagScheduler); + DynFields.UnboundField runningStagesField = + DynFields.builder().hiddenImpl(DAGScheduler.class, "runningStages").build(); + return (HashSet) runningStagesField.bind(dagScheduler).get(); } catch (Exception e) { LOG.error("cannot get running stages", e); return new HashSet<>(); @@ -54,7 +49,7 @@ private HashSet runningStages() { public boolean isRunningStage(int stageId) { try { for (Object stage : runningStages()) { - int currentStageId = (Integer) idField.get(stage); + int currentStageId = (Integer) idField.bind(stage).get(); if (currentStageId == stageId) { return true; } diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 4694926a093..e70ebbca2a3 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -17,7 +17,7 @@ package org.apache.celeborn.spark import java.util -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, TimeUnit} +import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, ScheduledExecutorService, TimeUnit} import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ @@ -66,9 +66,10 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanedShuffleIds.clear() celebornShuffleIdToReferringStages.clear() runningStageManager = buildRunningStageChecker() - cleanerThreadPool.shutdownNow() - cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( - "failedShuffleCleanerThreadPool") + if (cleanerThreadPool != null) { + cleanerThreadPool.shutdownNow() + } + } def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { @@ -116,18 +117,25 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def setLifecycleManager(ref: LifecycleManager): Unit = { val firstSet = lifecycleManager.compareAndSet(null, ref) if (firstSet) { + cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( + "failedShuffleCleanerThreadPool") cleanerThreadPool.scheduleWithFixedDelay( new Runnable { override def run(): Unit = { - val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleand.drainTo(allShuffleIds) - allShuffleIds.asScala.foreach { shuffleId => - if (!cleanedShuffleIds.contains(shuffleId)) { - lifecycleManager.get().unregisterShuffle(shuffleId) - logInfo( - s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - cleanedShuffleIds += shuffleId + try { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleand.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.get().unregisterShuffle(shuffleId) + logInfo( + s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId + } } + } catch { + case e: Exception => + logError("unexpected exception in cleaner thread", e) } } }, @@ -156,6 +164,5 @@ private[celeborn] object FailedShuffleCleaner extends Logging { ret } - private var cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( - "failedShuffleCleanerThreadPool") + private var cleanerThreadPool: ScheduledExecutorService = _ } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 5ec740204c2..6881009b001 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -151,6 +151,7 @@ private void initializeLifecycleManager(String appId) { lifecycleManager.registerShuffleTrackerCallback( shuffleId -> SparkUtils.unregisterAllMapOutput(mapOutputTracker, shuffleId)); + if (celebornConf.clientAdaptiveOptimizeSkewedPartitionReadEnabled()) { lifecycleManager.registerCelebornSkewShuffleCheckCallback( SparkUtils::isCelebornSkewShuffleOrChildShuffle); From e6586413ed5141e4f8fbfd079ac4e313fcc765a8 Mon Sep 17 00:00:00 2001 From: "Wang, Fei" Date: Tue, 29 Apr 2025 16:14:03 -0700 Subject: [PATCH 061/120] comments --- .../shuffle/celeborn/RunningStageManagerImpl.java | 10 +++++----- .../apache/celeborn/spark/FailedShuffleCleaner.scala | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index cbcd3c91fba..66669a95225 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -31,15 +31,15 @@ public class RunningStageManagerImpl implements RunningStageManager { private static final Logger LOG = LoggerFactory.getLogger(RunningStageManagerImpl.class); - private static final DynFields.UnboundField idField = + private static final DynFields.UnboundField id_FIELD = DynFields.builder().hiddenImpl(Stage.class, "id").build(); + private static final DynFields.UnboundField runningStages_FIELD = + DynFields.builder().hiddenImpl(DAGScheduler.class, "runningStages").build(); private HashSet runningStages() { try { DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); - DynFields.UnboundField runningStagesField = - DynFields.builder().hiddenImpl(DAGScheduler.class, "runningStages").build(); - return (HashSet) runningStagesField.bind(dagScheduler).get(); + return (HashSet) runningStages_FIELD.bind(dagScheduler).get(); } catch (Exception e) { LOG.error("cannot get running stages", e); return new HashSet<>(); @@ -49,7 +49,7 @@ private HashSet runningStages() { public boolean isRunningStage(int stageId) { try { for (Object stage : runningStages()) { - int currentStageId = (Integer) idField.bind(stage).get(); + int currentStageId = (Integer) id_FIELD.bind(stage).get(); if (currentStageId == stageId) { return true; } diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index e70ebbca2a3..97696c9a003 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -33,7 +33,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private val lifecycleManager = new AtomicReference[LifecycleManager](null) // in celeborn ids - private val shufflesToBeCleand = new LinkedBlockingQueue[Int]() + private val shufflesToBeCleaned = new LinkedBlockingQueue[Int]() private val cleanedShuffleIds = new mutable.HashSet[Int] // celeborn shuffle id to stage id referred to it private[celeborn] val celebornShuffleIdToReferringStages = @@ -62,14 +62,14 @@ private[celeborn] object FailedShuffleCleaner extends Logging { // for test def reset(): Unit = { lifecycleManager.set(null) - shufflesToBeCleand.clear() + shufflesToBeCleaned.clear() cleanedShuffleIds.clear() celebornShuffleIdToReferringStages.clear() runningStageManager = buildRunningStageChecker() if (cleanerThreadPool != null) { cleanerThreadPool.shutdownNow() + cleanerThreadPool = null } - } def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { @@ -99,7 +99,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) || noRunningDownstreamStage(celebornShuffleId) || !committedSuccessfully(celebornShuffleId)) { - shufflesToBeCleand.put(celebornShuffleId) + shufflesToBeCleaned.put(celebornShuffleId) } } } @@ -124,7 +124,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { override def run(): Unit = { try { val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleand.drainTo(allShuffleIds) + shufflesToBeCleaned.drainTo(allShuffleIds) allShuffleIds.asScala.foreach { shuffleId => if (!cleanedShuffleIds.contains(shuffleId)) { lifecycleManager.get().unregisterShuffle(shuffleId) From f407158e816b2cd9d6a0015c1df640d9618f06df Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 29 Apr 2025 17:01:17 -0700 Subject: [PATCH 062/120] ensure type safe --- .../spark/shuffle/celeborn/RunningStageManagerImpl.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index 66669a95225..6c7c12ded91 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -19,6 +19,8 @@ import java.util.HashSet; +import scala.collection.JavaConverters; + import org.apache.spark.SparkContext$; import org.apache.spark.scheduler.DAGScheduler; import org.apache.spark.scheduler.Stage; @@ -39,7 +41,9 @@ public class RunningStageManagerImpl implements RunningStageManager { private HashSet runningStages() { try { DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); - return (HashSet) runningStages_FIELD.bind(dagScheduler).get(); + return new HashSet<>( + JavaConverters.setAsJavaSet( + (scala.collection.mutable.HashSet) runningStages_FIELD.bind(dagScheduler).get())); } catch (Exception e) { LOG.error("cannot get running stages", e); return new HashSet<>(); From dbb1423d6c3c0aa197b7be6f1790a2a6baecf7cb Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 29 Apr 2025 17:24:43 -0700 Subject: [PATCH 063/120] make it compilable with spark 2 --- .../spark/shuffle/celeborn/RunningStageManagerImpl.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index 6c7c12ded91..8311c2e867a 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -42,8 +42,10 @@ private HashSet runningStages() { try { DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); return new HashSet<>( - JavaConverters.setAsJavaSet( - (scala.collection.mutable.HashSet) runningStages_FIELD.bind(dagScheduler).get())); + JavaConverters.asJavaCollectionConverter( + (scala.collection.mutable.HashSet) + runningStages_FIELD.bind(dagScheduler).get()) + .asJavaCollection()); } catch (Exception e) { LOG.error("cannot get running stages", e); return new HashSet<>(); From d8ed3319da21981e30a738719acece1623057f40 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 09:17:28 -0700 Subject: [PATCH 064/120] add unit test to guard runningstagemanagerimpl --- .../spark/CelebornFailedDiskCleanUtils.scala | 37 +++++++++++++++++++ .../CelebornFetchFailureDiskCleanSuite.scala | 3 ++ 2 files changed, 40 insertions(+) create mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala new file mode 100644 index 00000000000..93de117d7f6 --- /dev/null +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala @@ -0,0 +1,37 @@ +package org.apache.celeborn.tests.spark + +import org.apache.celeborn.common.protocol.ShuffleMode +import org.apache.celeborn.spark.FailedShuffleCleaner + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +class CelebornFailedDiskCleanUtils extends SparkTestBase { + test("test correctness of RunningStageManager") { + val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") + val sparkSession = SparkSession.builder() + .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) + .config("spark.sql.shuffle.partitions", 2) + .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) + .config("spark.celeborn.client.spark.fetch.throwsFetchFailure", "true") + .config( + "spark.shuffle.manager", + "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") + .getOrCreate() + val t = new Thread { + override def run(): Unit = { + try { + sparkSession.sparkContext.parallelize(List(1, 2, 3)).foreach(_ => + Thread.sleep(60 * 1000)) + } catch { + case _: Throwable => + // swallow everything + } + } + } + t.start() + Thread.sleep(20 * 1000) + assert(FailedShuffleCleaner.runningStageManager.isRunningStage(0)) + sparkSession.stop() + } +} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 441992c1a78..6287a3d671c 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -169,6 +169,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { val expect = "[2,1]" assert(tuple.mkString("[", ",", "]").equals(expect)) sparkSession.stop() + System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) } } @@ -198,6 +199,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { val expect = "[2,1]" assert(tuples.mkString("[", ",", "]").equals(expect)) sparkSession.stop() + System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) } } @@ -229,6 +231,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { val expect = "[2,1]" assert(tuple.mkString("[", ",", "]").equals(expect)) sparkSession.stop() + System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) } } } From c896f477c6bdbcdefdc75db5f131a2bd6259e925 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 09:25:38 -0700 Subject: [PATCH 065/120] add unit test --- .../celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala index 93de117d7f6..9450c162897 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala @@ -1,11 +1,11 @@ package org.apache.celeborn.tests.spark -import org.apache.celeborn.common.protocol.ShuffleMode -import org.apache.celeborn.spark.FailedShuffleCleaner - import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession +import org.apache.celeborn.common.protocol.ShuffleMode +import org.apache.celeborn.spark.FailedShuffleCleaner + class CelebornFailedDiskCleanUtils extends SparkTestBase { test("test correctness of RunningStageManager") { val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") From 6ab91a71381e085c93218fcb73b10ee6c0603377 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 09:28:44 -0700 Subject: [PATCH 066/120] add header --- .../spark/CelebornFailedDiskCleanUtils.scala | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala index 9450c162897..b4beaf15cd6 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.celeborn.tests.spark import org.apache.spark.SparkConf From e0c2ee77af75c2e2944c5b94e5b689b8b46dd7fd Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 12:30:38 -0700 Subject: [PATCH 067/120] update test --- .../spark/CelebornFailedDiskCleanUtils.scala | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala index b4beaf15cd6..32f44b299f2 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala @@ -18,6 +18,9 @@ package org.apache.celeborn.tests.spark import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession +import org.scalatest.concurrent.Eventually.eventually +import org.scalatest.concurrent.Futures.{interval, timeout} +import org.scalatest.time.SpanSugar.convertIntToGrainOfTime import org.apache.celeborn.common.protocol.ShuffleMode import org.apache.celeborn.spark.FailedShuffleCleaner @@ -34,20 +37,22 @@ class CelebornFailedDiskCleanUtils extends SparkTestBase { "spark.shuffle.manager", "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") .getOrCreate() - val t = new Thread { - override def run(): Unit = { - try { - sparkSession.sparkContext.parallelize(List(1, 2, 3)).foreach(_ => - Thread.sleep(60 * 1000)) - } catch { - case _: Throwable => - // swallow everything + var t: Thread = null + eventually(timeout(20.seconds), interval(100.milliseconds)) { + t = new Thread { + override def run(): Unit = { + try { + sparkSession.sparkContext.parallelize(List(1, 2, 3)).foreach(_ => + Thread.sleep(60 * 1000)) + } catch { + case _: Throwable => + // swallow everything + } } } + t.start() + assert(FailedShuffleCleaner.runningStageManager.isRunningStage(0)) } - t.start() - Thread.sleep(20 * 1000) - assert(FailedShuffleCleaner.runningStageManager.isRunningStage(0)) sparkSession.stop() } } From b7249599013ec4aedfd95981c420491d848d6fb5 Mon Sep 17 00:00:00 2001 From: "Wang, Fei" Date: Wed, 30 Apr 2025 13:49:28 -0700 Subject: [PATCH 068/120] RunningStageManager UT --- ...> CelebornFailedDiskCleanUtilsSuite.scala} | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) rename tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/{CelebornFailedDiskCleanUtils.scala => CelebornFailedDiskCleanUtilsSuite.scala} (72%) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala similarity index 72% rename from tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala rename to tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala index 32f44b299f2..e881d49d49f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtils.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala @@ -25,8 +25,9 @@ import org.scalatest.time.SpanSugar.convertIntToGrainOfTime import org.apache.celeborn.common.protocol.ShuffleMode import org.apache.celeborn.spark.FailedShuffleCleaner -class CelebornFailedDiskCleanUtils extends SparkTestBase { +class CelebornFailedDiskCleanUtilsSuite extends SparkTestBase { test("test correctness of RunningStageManager") { + System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") val sparkSession = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) @@ -37,22 +38,34 @@ class CelebornFailedDiskCleanUtils extends SparkTestBase { "spark.shuffle.manager", "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") .getOrCreate() - var t: Thread = null - eventually(timeout(20.seconds), interval(100.milliseconds)) { - t = new Thread { + + try { + val t = new Thread { override def run(): Unit = { try { - sparkSession.sparkContext.parallelize(List(1, 2, 3)).foreach(_ => - Thread.sleep(60 * 1000)) + sparkSession.sparkContext.parallelize(List(1, 2, 3)).mapPartitions { iter => + Thread.sleep(60 * 1000) + iter + }.collect() } catch { - case _: Throwable => - // swallow everything + case _: InterruptedException => } } } t.start() - assert(FailedShuffleCleaner.runningStageManager.isRunningStage(0)) + + eventually(timeout(20.seconds), interval(100.milliseconds)) { + assert(FailedShuffleCleaner.runningStageManager.isRunningStage(0)) + } + + sparkSession.sparkContext.cancelAllJobs() + t.interrupt() + + eventually(timeout(10.seconds), interval(100.milliseconds)) { + assert(!FailedShuffleCleaner.runningStageManager.isRunningStage(0)) + } + } finally { + sparkSession.stop() } - sparkSession.stop() } } From ec215556c67aa0871ee1987015cba8c0d0514d60 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 20:30:07 -0700 Subject: [PATCH 069/120] avoid using property --- .../celeborn/spark/FailedShuffleCleaner.scala | 17 ++++++------ .../apache/celeborn/common/CelebornConf.scala | 10 +++++++ .../CelebornFailedDiskCleanUtilsSuite.scala | 9 +++---- .../CelebornFetchFailureDiskCleanSuite.scala | 27 +++++++------------ .../FetchFailureDiskCleanBase.scala | 2 +- .../fetch_failure/FetchFailureTestBase.scala | 13 ++++++++- 6 files changed, 43 insertions(+), 35 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 97696c9a003..870a7a829d6 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -44,24 +44,21 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private lazy val cleanInterval = lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS - val RUNNING_STAGE_CHECKER_CLASS = "CELEBORN_TEST_RUNNING_STAGE_CHECKER_IMPL" + private[celeborn] var runningStageManager: RunningStageManager = _ - private[celeborn] var runningStageManager: RunningStageManager = buildRunningStageChecker() - - // for testing private def buildRunningStageChecker(): RunningStageManager = { - if (System.getProperty(RUNNING_STAGE_CHECKER_CLASS) == null) { - new RunningStageManagerImpl - } else { - val className = System.getProperty(RUNNING_STAGE_CHECKER_CLASS) + val lifecycleMgrRef = lifecycleManager.get() + if (lifecycleMgrRef != null) { + val className = lifecycleManager.get().conf.clientFetchCleanFailedShuffleRunningMgrImpl val claz = Class.forName(className) claz.getDeclaredConstructor().newInstance().asInstanceOf[RunningStageManager] + } else { + null } } // for test def reset(): Unit = { - lifecycleManager.set(null) shufflesToBeCleaned.clear() cleanedShuffleIds.clear() celebornShuffleIdToReferringStages.clear() @@ -70,6 +67,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanerThreadPool.shutdownNow() cleanerThreadPool = null } + lifecycleManager.set(null) } def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { @@ -119,6 +117,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { if (firstSet) { cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( "failedShuffleCleanerThreadPool") + runningStageManager = buildRunningStageChecker() cleanerThreadPool.scheduleWithFixedDelay( new Runnable { override def run(): Unit = { diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 86515916794..14ec4637329 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -1000,6 +1000,8 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se def clientFetchCleanFailedShuffle: Boolean = get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE) def clientFetchCleanFailedShuffleIntervalMS: Long = get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL) + def clientFetchCleanFailedShuffleRunningMgrImpl: String = + get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_RUNNING_STAGE_MGR_IMPL) def clientFetchExcludeWorkerOnFailureEnabled: Boolean = get(CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED) def clientFetchExcludedWorkerExpireTimeout: Long = @@ -4824,6 +4826,14 @@ object CelebornConf extends Logging { .booleanConf .createWithDefault(false) + val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_RUNNING_STAGE_MGR_IMPL: ConfigEntry[String] = + buildConf("celeborn.client.spark.fetch.cleanFailedShuffle.runningStageManagerImpl") + .categories("client") + .version("0.6.0") + .doc("full class name of of running stage manager implementation, mainly for test") + .stringConf + .createWithDefault("org.apache.spark.shuffle.celeborn.RunningStageManagerImpl") + val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL: ConfigEntry[Long] = buildConf("celeborn.client.spark.fetch.cleanFailedShuffleInterval") .categories("client") diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala index e881d49d49f..c5da775f949 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala @@ -16,14 +16,14 @@ */ package org.apache.celeborn.tests.spark -import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession +import org.apache.spark.SparkEnv +import org.apache.spark.shuffle.celeborn.SparkShuffleManager import org.scalatest.concurrent.Eventually.eventually import org.scalatest.concurrent.Futures.{interval, timeout} import org.scalatest.time.SpanSugar.convertIntToGrainOfTime -import org.apache.celeborn.common.protocol.ShuffleMode import org.apache.celeborn.spark.FailedShuffleCleaner +import org.apache.celeborn.tests.spark.fetch_failure.FetchFailureDiskCleanBase class CelebornFailedDiskCleanUtilsSuite extends SparkTestBase { test("test correctness of RunningStageManager") { @@ -49,9 +49,6 @@ class CelebornFailedDiskCleanUtilsSuite extends SparkTestBase { }.collect() } catch { case _: InterruptedException => - } - } - } t.start() eventually(timeout(20.seconds), interval(100.milliseconds)) { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 6287a3d671c..8cfdc096aaa 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -143,15 +143,13 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { // 7. if the dependency is 1 to M , we should not clean it test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { if (Spark3OrNewer) { - System.setProperty( - FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, - "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") - FailedShuffleCleaner.reset() // create dummy running stages TestRunningStageManager.runningStages += 2 FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val sparkSession = createSparkSession( + enableFailedShuffleCleaner = true, + enableCustomizedRunningStageMgr = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FileDeletionShuffleReaderGetHook( celebornConf, @@ -169,22 +167,19 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { val expect = "[2,1]" assert(tuple.mkString("[", ",", "]").equals(expect)) sparkSession.stop() - System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) } } // 8. if the dependency is 1 to M but failed in commit phase, we should just clean it test("celeborn spark integration test - clear the failed-to-commit shuffle file even it is referred by more than once") { if (Spark3OrNewer) { - System.setProperty( - FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, - "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") - FailedShuffleCleaner.reset() // create dummy running stages TestRunningStageManager.runningStages += 2 FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val sparkSession = createSparkSession( + enableFailedShuffleCleaner = true, + enableCustomizedRunningStageMgr = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FailCommitShuffleReaderGetHook(celebornConf) TestCelebornShuffleManager.registerReaderGetHook(hook) @@ -199,21 +194,18 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { val expect = "[2,1]" assert(tuples.mkString("[", ",", "]").equals(expect)) sparkSession.stop() - System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) } } test("celeborn spark integration test - clean up the shuffle files if" + " the referring stage has finished") { if (Spark3OrNewer) { - System.setProperty( - FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS, - "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") - FailedShuffleCleaner.reset() // create dummy running stages FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + val sparkSession = createSparkSession( + enableFailedShuffleCleaner = true, + enableCustomizedRunningStageMgr = true) val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) val hook = new FileDeletionShuffleReaderGetHook( celebornConf, @@ -231,7 +223,6 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { val expect = "[2,1]" assert(tuple.mkString("[", ",", "]").equals(expect)) sparkSession.stop() - System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) } } } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index 846216087bd..df7703b0bf7 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -38,11 +38,11 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite override def beforeEach(): Unit = { ShuffleClient.reset() - FailedShuffleCleaner.reset() } override def afterEach(): Unit = { System.gc() + FailedShuffleCleaner.reset() } override def createWorker(map: Map[String, String]): Worker = { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index dd76ba66e65..1f2f84a4bd4 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -27,7 +27,8 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { def createSparkSession( overrideShuffleMgr: Boolean = true, - enableFailedShuffleCleaner: Boolean = false): SparkSession = { + enableFailedShuffleCleaner: Boolean = false, + enableCustomizedRunningStageMgr: Boolean = false): SparkSession = { val sparkConf = new SparkConf().setAppName({ if (!enableFailedShuffleCleaner) { "fetch-failure" @@ -58,6 +59,16 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { } else { baseBuilder } + + baseBuilder = + if (enableCustomizedRunningStageMgr) { + baseBuilder.config( + "spark.celeborn.client.spark.fetch.cleanFailedShuffle" + + ".runningStageManagerImpl", + "org.apache.celeborn.tests.spark.fetch_failure.TestRunningStageManager") + } else { + baseBuilder + } baseBuilder.getOrCreate() } } From 5d8ade0b029b208cc92f87f1039d411bb1d33f84 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 20:42:14 -0700 Subject: [PATCH 070/120] merge --- .../CelebornFailedDiskCleanUtilsSuite.scala | 63 +++++++++---------- 1 file changed, 28 insertions(+), 35 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala index c5da775f949..4361a1a84dd 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala @@ -16,8 +16,6 @@ */ package org.apache.celeborn.tests.spark -import org.apache.spark.SparkEnv -import org.apache.spark.shuffle.celeborn.SparkShuffleManager import org.scalatest.concurrent.Eventually.eventually import org.scalatest.concurrent.Futures.{interval, timeout} import org.scalatest.time.SpanSugar.convertIntToGrainOfTime @@ -25,44 +23,39 @@ import org.scalatest.time.SpanSugar.convertIntToGrainOfTime import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.FetchFailureDiskCleanBase -class CelebornFailedDiskCleanUtilsSuite extends SparkTestBase { +class CelebornFailedDiskCleanUtilsSuite extends FetchFailureDiskCleanBase { test("test correctness of RunningStageManager") { - System.clearProperty(FailedShuffleCleaner.RUNNING_STAGE_CHECKER_CLASS) - val sparkConf = new SparkConf().setAppName("rss-demo").setMaster("local[2,3]") - val sparkSession = SparkSession.builder() - .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) - .config("spark.sql.shuffle.partitions", 2) - .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) - .config("spark.celeborn.client.spark.fetch.throwsFetchFailure", "true") - .config( - "spark.shuffle.manager", - "org.apache.spark.shuffle.celeborn.TestCelebornShuffleManager") - .getOrCreate() + if (Spark3OrNewer) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + sparkSession.sparkContext.parallelize(List(1, 2, 3)).repartition(1).count() + try { + val t = new Thread { + override def run(): Unit = { + try { + sparkSession.sparkContext.parallelize(List(1, 2, 3)).mapPartitions { iter => + Thread.sleep(60 * 1000) + iter + }.collect() + } catch { + case _: InterruptedException => + } + } + } + t.start() - try { - val t = new Thread { - override def run(): Unit = { - try { - sparkSession.sparkContext.parallelize(List(1, 2, 3)).mapPartitions { iter => - Thread.sleep(60 * 1000) - iter - }.collect() - } catch { - case _: InterruptedException => - t.start() + eventually(timeout(20.seconds), interval(100.milliseconds)) { + assert(FailedShuffleCleaner.runningStageManager.isRunningStage(2)) + } - eventually(timeout(20.seconds), interval(100.milliseconds)) { - assert(FailedShuffleCleaner.runningStageManager.isRunningStage(0)) - } - - sparkSession.sparkContext.cancelAllJobs() - t.interrupt() + sparkSession.sparkContext.cancelAllJobs() + t.interrupt() - eventually(timeout(10.seconds), interval(100.milliseconds)) { - assert(!FailedShuffleCleaner.runningStageManager.isRunningStage(0)) + eventually(timeout(10.seconds), interval(100.milliseconds)) { + assert(!FailedShuffleCleaner.runningStageManager.isRunningStage(2)) + } + } finally { + sparkSession.stop() } - } finally { - sparkSession.stop() } } } From ef55deffec3e355a187785d89afb7316d537a653 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 21:01:56 -0700 Subject: [PATCH 071/120] param fix --- docs/configuration/client.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/configuration/client.md b/docs/configuration/client.md index e4e8e0e83ee..9e697ca48ba 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -112,6 +112,7 @@ license: | | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 | | | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 | | | celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.6.0 | | +| celeborn.client.spark.fetch.cleanFailedShuffle.runningStageManagerImpl | org.apache.spark.shuffle.celeborn.RunningStageManagerImpl | false | full class name of of running stage manager implementation, mainly for test | 0.6.0 | | | celeborn.client.spark.fetch.cleanFailedShuffleInterval | 1s | false | the interval to clean the failed-to-fetch shuffle files, only valid when celeborn.client.spark.fetch.cleanFailedShuffle is enabled | 0.6.0 | | | celeborn.client.spark.push.dynamicWriteMode.enabled | false | false | Whether to dynamically switch push write mode based on conditions.If true, shuffle mode will be only determined by partition count | 0.5.0 | | | celeborn.client.spark.push.dynamicWriteMode.partitionNum.threshold | 2000 | false | Threshold of shuffle partition number for dynamically switching push writer mode. When the shuffle partition number is greater than this value, use the sort-based shuffle writer for memory efficiency; otherwise use the hash-based shuffle writer for speed. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is true. | 0.5.0 | | From b0d9bb2b542f5282a9445dc8243f61639fbc4f40 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 21:45:10 -0700 Subject: [PATCH 072/120] handle indeterminstic case --- .../celeborn/RunningStageManagerImpl.java | 23 +++++++++++++ .../celeborn/spark/FailedShuffleCleaner.scala | 5 +++ .../celeborn/spark/RunningStageManager.scala | 2 ++ .../CelebornFetchFailureDiskCleanSuite.scala | 33 ++++++++++++++++++- .../TestRunningStageManager.scala | 9 +++++ .../scheduler/SparkSchedulerHelper.scala | 2 ++ 6 files changed, 73 insertions(+), 1 deletion(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index 8311c2e867a..d266c0d77c4 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -17,6 +17,7 @@ package org.apache.spark.shuffle.celeborn; +import java.util.HashMap; import java.util.HashSet; import scala.collection.JavaConverters; @@ -37,6 +38,8 @@ public class RunningStageManagerImpl implements RunningStageManager { DynFields.builder().hiddenImpl(Stage.class, "id").build(); private static final DynFields.UnboundField runningStages_FIELD = DynFields.builder().hiddenImpl(DAGScheduler.class, "runningStages").build(); + private static final DynFields.UnboundField stageIdToStage_FIELD = + DynFields.builder().hiddenImpl(DAGScheduler.class, "stageIdToStage").build(); private HashSet runningStages() { try { @@ -52,6 +55,20 @@ private HashSet runningStages() { } } + private HashMap stageIdToStageMap() { + try { + DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); + return new HashMap<>( + JavaConverters.mapAsJavaMapConverter( + (scala.collection.mutable.HashMap) + stageIdToStage_FIELD.bind(dagScheduler).get()) + .asJava()); + } catch (Exception e) { + LOG.error("cannot get running stages", e); + return new HashMap<>(); + } + } + public boolean isRunningStage(int stageId) { try { for (Object stage : runningStages()) { @@ -66,4 +83,10 @@ public boolean isRunningStage(int stageId) { return true; } } + + @Override + public boolean isDeterministicStage(int stageId) { + HashMap map = stageIdToStageMap(); + return ((Stage) map.get(stageId)).isIndeterminate(); + } } diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 870a7a829d6..98c3d3e5bf0 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -96,6 +96,7 @@ private[celeborn] object FailedShuffleCleaner extends Logging { if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) || noRunningDownstreamStage(celebornShuffleId) + || !isDeterministicStage(stageId.toInt) || !committedSuccessfully(celebornShuffleId)) { shufflesToBeCleaned.put(celebornShuffleId) } @@ -148,6 +149,10 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanedShuffleIds.remove(celebornShuffleId) } + private def isDeterministicStage(stageId: Int): Boolean = { + runningStageManager.isDeterministicStage(stageId) + } + private def noRunningDownstreamStage(celebornShuffleId: Int): Boolean = lock.synchronized { val allReferringStageIds = celebornShuffleIdToReferringStages.get(celebornShuffleId) require(allReferringStageIds != null, s"no stage referring to shuffle $celebornShuffleId") diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala index b70851583c5..5ddc0b5579d 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala @@ -18,4 +18,6 @@ package org.apache.celeborn.spark trait RunningStageManager { def isRunningStage(stageId: Int): Boolean + + def isDeterministicStage(stageId: Int): Boolean } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 8cfdc096aaa..0c878e8bb7f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -24,7 +24,7 @@ import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { - + /* // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { @@ -224,5 +224,36 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { assert(tuple.mkString("[", ",", "]").equals(expect)) sparkSession.stop() } + }*/ + + test("celeborn spark integration test - clean up the shuffle files if" + + " the upstream stage is indeterministic") { + if (Spark3OrNewer) { + TestRunningStageManager.runningStages += 2 + TestRunningStageManager.indeterministicStages += 0 + // create dummy running stages + FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) + FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 + val sparkSession = createSparkSession( + enableFailedShuffleCleaner = true, + enableCustomizedRunningStageMgr = true) + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + val checkingThread = + triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = true) + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + val tuple = df1.collect().map(r => r.getAs[Int]("id")) + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[2,1]" + assert(tuple.mkString("[", ",", "]").equals(expect)) + sparkSession.stop() + } } } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala index ec0aaf56139..685682949ca 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala @@ -32,8 +32,17 @@ class TestRunningStageManager extends RunningStageManager { SparkSchedulerHelper.runningStages.map(_.id).contains(stageId) } } + + override def isDeterministicStage(stageId: Int): Boolean = { + if (indeterministicStages.contains(stageId)) { + false + } else { + SparkSchedulerHelper.stageIdToStage.get(stageId).exists(_.isIndeterminate) + } + } } object TestRunningStageManager { val runningStages = new mutable.HashSet[Int] + val indeterministicStages = new mutable.HashSet[Int]() } diff --git a/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala b/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala index 0eafaee935d..83c5b5942ca 100644 --- a/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala +++ b/tests/spark-it/src/test/scala/org/apache/spark/scheduler/SparkSchedulerHelper.scala @@ -23,4 +23,6 @@ object SparkSchedulerHelper { def dagScheduler = SparkContext.getActive.get.dagScheduler def runningStages = dagScheduler.runningStages + + def stageIdToStage = dagScheduler.stageIdToStage } From 2395e26c0c307fd66c887f63ea9ae6bc4fbb7972 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 21:46:41 -0700 Subject: [PATCH 073/120] resume tests --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 0c878e8bb7f..9d9f39bce24 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -24,7 +24,7 @@ import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { - /* + // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { @@ -224,7 +224,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { assert(tuple.mkString("[", ",", "]").equals(expect)) sparkSession.stop() } - }*/ + } test("celeborn spark integration test - clean up the shuffle files if" + " the upstream stage is indeterministic") { From b8961209787434b60744a854e308c13943f00a3f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 21:50:09 -0700 Subject: [PATCH 074/120] lint --- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 9d9f39bce24..48cea8d9f3d 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -24,7 +24,7 @@ import org.apache.celeborn.spark.FailedShuffleCleaner import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { - + // 1. for single level 1-1 lineage, the old disk space is cleaned before the spark application // finish test("celeborn spark integration test - (1-1 dep with, single level lineage) the failed shuffle file is cleaned up correctly") { From 0b39821dede58a842260e1284b9ee300c9dbc0cc Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 22:38:11 -0700 Subject: [PATCH 075/120] fix typos --- .../shuffle/celeborn/RunningStageManagerImpl.java | 15 ++++++++++++++- .../celeborn/spark/FailedShuffleCleaner.scala | 2 +- .../fetch_failure/TestRunningStageManager.scala | 2 +- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index d266c0d77c4..9f5a593ce1b 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -17,6 +17,7 @@ package org.apache.spark.shuffle.celeborn; +import java.lang.reflect.Method; import java.util.HashMap; import java.util.HashSet; @@ -87,6 +88,18 @@ public boolean isRunningStage(int stageId) { @Override public boolean isDeterministicStage(int stageId) { HashMap map = stageIdToStageMap(); - return ((Stage) map.get(stageId)).isIndeterminate(); + Object stage = map.get(stageId); + try { + Method isIndeterminateMethod = stage.getClass().getMethod("isIndeterminate"); + boolean isIndeterminate = (boolean) isIndeterminateMethod.invoke(stage); + System.out.println("returning " + isIndeterminate); + return !isIndeterminate; + } catch (NoSuchMethodException e) { + System.out.println("Method isIndeterminate not found on stage object: " + e.getMessage()); + return true; + } catch (Exception e) { + System.out.println("Error invoking isIndeterminate: " + e.getMessage()); + return true; + } } } diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 98c3d3e5bf0..cc72693d745 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -94,9 +94,9 @@ private[celeborn] object FailedShuffleCleaner extends Logging { lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { case (_, (celebornShuffleId, _)) => { if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) + || !isDeterministicStage(stageId.toInt) || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) || noRunningDownstreamStage(celebornShuffleId) - || !isDeterministicStage(stageId.toInt) || !committedSuccessfully(celebornShuffleId)) { shufflesToBeCleaned.put(celebornShuffleId) } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala index 685682949ca..b431dcedaa1 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala @@ -37,7 +37,7 @@ class TestRunningStageManager extends RunningStageManager { if (indeterministicStages.contains(stageId)) { false } else { - SparkSchedulerHelper.stageIdToStage.get(stageId).exists(_.isIndeterminate) + !SparkSchedulerHelper.stageIdToStage.get(stageId).exists(_.isIndeterminate) } } } From fb5a84e086543adb3ecb5367c33bfd6eeaeb4ba1 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 30 Apr 2025 22:49:37 -0700 Subject: [PATCH 076/120] fix spark 2 --- .../spark/shuffle/celeborn/RunningStageManagerImpl.java | 5 ++--- .../tests/spark/fetch_failure/TestRunningStageManager.scala | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java index 9f5a593ce1b..3a1c667ab8e 100644 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java @@ -92,13 +92,12 @@ public boolean isDeterministicStage(int stageId) { try { Method isIndeterminateMethod = stage.getClass().getMethod("isIndeterminate"); boolean isIndeterminate = (boolean) isIndeterminateMethod.invoke(stage); - System.out.println("returning " + isIndeterminate); return !isIndeterminate; } catch (NoSuchMethodException e) { - System.out.println("Method isIndeterminate not found on stage object: " + e.getMessage()); + LOG.warn("Method isIndeterminate not found on stage object: " + e.getMessage()); return true; } catch (Exception e) { - System.out.println("Error invoking isIndeterminate: " + e.getMessage()); + LOG.warn("Error invoking isIndeterminate: " + e.getMessage()); return true; } } diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala index b431dcedaa1..41d226ed3a9 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala @@ -37,7 +37,7 @@ class TestRunningStageManager extends RunningStageManager { if (indeterministicStages.contains(stageId)) { false } else { - !SparkSchedulerHelper.stageIdToStage.get(stageId).exists(_.isIndeterminate) + true } } } From 4e1aa67d3cbf8c7e833e2f732e5676acec56516f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Thu, 1 May 2025 08:04:47 -0700 Subject: [PATCH 077/120] change debugging string --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index df7703b0bf7..ec2d278051f 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -81,7 +81,8 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite new File(s"$dir/celeborn-worker/shuffle_data/" + s"${sparkSession.sparkContext.applicationId}/$shuffleId").exists()).toList }).mkString(",") - println(s"${deletedSuccessfullyString} \t $createdSuccessfullyString") + println(s"shuffle-to-be-deleted status: $deletedSuccessfullyString \n" + + s"shuffle-to-be-created status: $createdSuccessfullyString") deletedSuccessfully && createdSuccessfully } From 71519929935420a4403bc80b068727f357227859 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 6 May 2025 13:44:29 -0700 Subject: [PATCH 078/120] simplify code --- .../celeborn/spark/FailedShuffleCleaner.scala | 68 +--------- .../shuffle/celeborn/SparkShuffleManager.java | 4 - .../spark/shuffle/celeborn/SparkUtils.java | 6 - .../celeborn/client/LifecycleManager.scala | 7 -- .../CelebornFailedDiskCleanUtilsSuite.scala | 61 --------- .../CelebornFetchFailureDiskCleanSuite.scala | 117 ------------------ 6 files changed, 3 insertions(+), 260 deletions(-) delete mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index cc72693d745..bc5be2bef46 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -23,7 +23,7 @@ import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.mutable -import org.apache.spark.shuffle.celeborn.{RunningStageManagerImpl, SparkCommonUtils} +import org.apache.spark.shuffle.celeborn.SparkCommonUtils import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging @@ -35,17 +35,12 @@ private[celeborn] object FailedShuffleCleaner extends Logging { // in celeborn ids private val shufflesToBeCleaned = new LinkedBlockingQueue[Int]() private val cleanedShuffleIds = new mutable.HashSet[Int] - // celeborn shuffle id to stage id referred to it - private[celeborn] val celebornShuffleIdToReferringStages = - new ConcurrentHashMap[Int, mutable.HashSet[Int]]() private val lock = new Object private lazy val cleanInterval = lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS - private[celeborn] var runningStageManager: RunningStageManager = _ - private def buildRunningStageChecker(): RunningStageManager = { val lifecycleMgrRef = lifecycleManager.get() if (lifecycleMgrRef != null) { @@ -61,8 +56,6 @@ private[celeborn] object FailedShuffleCleaner extends Logging { def reset(): Unit = { shufflesToBeCleaned.clear() cleanedShuffleIds.clear() - celebornShuffleIdToReferringStages.clear() - runningStageManager = buildRunningStageChecker() if (cleanerThreadPool != null) { cleanerThreadPool.shutdownNow() cleanerThreadPool = null @@ -70,55 +63,19 @@ private[celeborn] object FailedShuffleCleaner extends Logging { lifecycleManager.set(null) } - def addShuffleIdReferringStage(celebornShuffleId: Int, appShuffleIdentifier: String): Unit = { - val Array(_, stageId, _) = SparkCommonUtils.decodeAppShuffleIdentifier(appShuffleIdentifier) - celebornShuffleIdToReferringStages.putIfAbsent(celebornShuffleId, new mutable.HashSet[Int]) - lock.synchronized { - celebornShuffleIdToReferringStages.get(celebornShuffleId).add(stageId.toInt) - } - } - - private def onlyCurrentStageReferred(celebornShuffleId: Int, stageId: Int): Boolean = - lock.synchronized { - val ret = celebornShuffleIdToReferringStages.get(celebornShuffleId).size == 1 && - celebornShuffleIdToReferringStages.get(celebornShuffleId).contains(stageId) - if (ret) { - logInfo(s"only stage $stageId refers to shuffle $celebornShuffleId, adding for clean up") - } - ret - } - def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { - val Array(appShuffleId, stageId, _) = SparkCommonUtils.decodeAppShuffleIdentifier( + val Array(appShuffleId, _, _) = SparkCommonUtils.decodeAppShuffleIdentifier( appShuffleIdentifier) lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { - case (_, (celebornShuffleId, _)) => { - if (!celebornShuffleIdToReferringStages.containsKey(celebornShuffleId) - || !isDeterministicStage(stageId.toInt) - || onlyCurrentStageReferred(celebornShuffleId, stageId.toInt) - || noRunningDownstreamStage(celebornShuffleId) - || !committedSuccessfully(celebornShuffleId)) { - shufflesToBeCleaned.put(celebornShuffleId) - } - } + case (_, (celebornShuffleId, _)) => shufflesToBeCleaned.put(celebornShuffleId) } } - private def committedSuccessfully(celebornShuffleId: Int): Boolean = { - val ret = !lifecycleManager.get().commitManager.getCommitHandler(celebornShuffleId) - .isStageDataLost(celebornShuffleId) - if (!ret) { - logInfo(s"shuffle $celebornShuffleId is failed to commit, adding for cleaning up") - } - ret - } - def setLifecycleManager(ref: LifecycleManager): Unit = { val firstSet = lifecycleManager.compareAndSet(null, ref) if (firstSet) { cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( "failedShuffleCleanerThreadPool") - runningStageManager = buildRunningStageChecker() cleanerThreadPool.scheduleWithFixedDelay( new Runnable { override def run(): Unit = { @@ -149,24 +106,5 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanedShuffleIds.remove(celebornShuffleId) } - private def isDeterministicStage(stageId: Int): Boolean = { - runningStageManager.isDeterministicStage(stageId) - } - - private def noRunningDownstreamStage(celebornShuffleId: Int): Boolean = lock.synchronized { - val allReferringStageIds = celebornShuffleIdToReferringStages.get(celebornShuffleId) - require(allReferringStageIds != null, s"no stage referring to shuffle $celebornShuffleId") - val ret = - allReferringStageIds.count(stageId => runningStageManager.isRunningStage(stageId)) == 0 - if (ret) { - logInfo(s"no running downstream stages refers to $celebornShuffleId") - } else { - logInfo( - s"there is more than one running downstream stage referring to shuffle $celebornShuffleId," + - s" ignore it for cleanup ") - } - ret - } - private var cleanerThreadPool: ScheduledExecutorService = _ } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 6881009b001..9327a8dc059 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -171,10 +171,6 @@ private void initializeLifecycleManager(String appId) { (appShuffleIdentifier) -> SparkUtils.addWriterShuffleIdsToBeCleaned( lifecycleManager, appShuffleIdentifier)); - lifecycleManager.registerRecordShuffleIdReferenceCallback( - (celebornShuffleId, appShuffleIdentifier) -> - SparkUtils.addShuffleIdRefStage( - lifecycleManager, celebornShuffleId, appShuffleIdentifier)); lifecycleManager.registerUnregisterShuffleCallback( (celebornShuffleId) -> SparkUtils.removeCleanedShuffleId(lifecycleManager, celebornShuffleId)); diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index 42a0c8207aa..e285be25a18 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -632,12 +632,6 @@ public static void addWriterShuffleIdsToBeCleaned( FailedShuffleCleaner.addShuffleIdToBeCleaned(appShuffleIdentifier); } - public static void addShuffleIdRefStage( - LifecycleManager lifecycleManager, int celebornShuffleId, String appShuffleIdentifier) { - FailedShuffleCleaner.setLifecycleManager(lifecycleManager); - FailedShuffleCleaner.addShuffleIdReferringStage(celebornShuffleId, appShuffleIdentifier); - } - public static void removeCleanedShuffleId( LifecycleManager lifecycleManager, int celebornShuffleId) { FailedShuffleCleaner.setLifecycleManager(lifecycleManager); diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index d9436152652..a9685623d6f 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -954,8 +954,6 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends shuffleIds.values.filter(v => v._2).map(v => v._1).toSeq.reverse.find( areAllMapTasksEnd) match { case Some(celebornShuffleId) => - recordShuffleIdReference.foreach(callback => - callback.accept(celebornShuffleId, appShuffleIdentifier)) val pbGetShuffleIdResponse = { logDebug( s"get shuffleId $celebornShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier isWriter $isWriter") @@ -1863,11 +1861,6 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends callback: Consumer[String]): Unit = { validateCelebornShuffleIdForClean = Some(callback) } - // expecting celeborn shuffle id and application shuffle identifier - @volatile private var recordShuffleIdReference: Option[BiConsumer[Integer, String]] = None - def registerRecordShuffleIdReferenceCallback(callback: BiConsumer[Integer, String]): Unit = { - recordShuffleIdReference = Some(callback) - } @volatile private var unregisterShuffleCallback: Option[Consumer[Integer]] = None def registerUnregisterShuffleCallback(callback: Consumer[Integer]): Unit = { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala deleted file mode 100644 index 4361a1a84dd..00000000000 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFailedDiskCleanUtilsSuite.scala +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.celeborn.tests.spark - -import org.scalatest.concurrent.Eventually.eventually -import org.scalatest.concurrent.Futures.{interval, timeout} -import org.scalatest.time.SpanSugar.convertIntToGrainOfTime - -import org.apache.celeborn.spark.FailedShuffleCleaner -import org.apache.celeborn.tests.spark.fetch_failure.FetchFailureDiskCleanBase - -class CelebornFailedDiskCleanUtilsSuite extends FetchFailureDiskCleanBase { - test("test correctness of RunningStageManager") { - if (Spark3OrNewer) { - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) - sparkSession.sparkContext.parallelize(List(1, 2, 3)).repartition(1).count() - try { - val t = new Thread { - override def run(): Unit = { - try { - sparkSession.sparkContext.parallelize(List(1, 2, 3)).mapPartitions { iter => - Thread.sleep(60 * 1000) - iter - }.collect() - } catch { - case _: InterruptedException => - } - } - } - t.start() - - eventually(timeout(20.seconds), interval(100.milliseconds)) { - assert(FailedShuffleCleaner.runningStageManager.isRunningStage(2)) - } - - sparkSession.sparkContext.cancelAllJobs() - t.interrupt() - - eventually(timeout(10.seconds), interval(100.milliseconds)) { - assert(!FailedShuffleCleaner.runningStageManager.isRunningStage(2)) - } - } finally { - sparkSession.stop() - } - } - } -} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 48cea8d9f3d..74817b4fa22 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -139,121 +139,4 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { sparkSession.stop() } } - - // 7. if the dependency is 1 to M , we should not clean it - test("celeborn spark integration test - Do not clean up the shuffle files being referred by more than one stages") { - if (Spark3OrNewer) { - // create dummy running stages - TestRunningStageManager.runningStages += 2 - FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) - FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession( - enableFailedShuffleCleaner = true, - enableCustomizedRunningStageMgr = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = - triggerStorageCheckThread(Seq(), Seq(0, 1), sparkSession, forStableStatusChecking = true) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - val tuple = df1.collect().map(r => r.getAs[Int]("id")) - checkStorageValidation(checkingThread) - // verify result - assert(hook.executed.get()) - val expect = "[2,1]" - assert(tuple.mkString("[", ",", "]").equals(expect)) - sparkSession.stop() - } - } - - // 8. if the dependency is 1 to M but failed in commit phase, we should just clean it - test("celeborn spark integration test - clear the failed-to-commit shuffle file even it is referred by more than once") { - if (Spark3OrNewer) { - // create dummy running stages - TestRunningStageManager.runningStages += 2 - FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) - FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession( - enableFailedShuffleCleaner = true, - enableCustomizedRunningStageMgr = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FailCommitShuffleReaderGetHook(celebornConf) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = - triggerStorageCheckThread(Seq(0, 2), Seq(1), sparkSession, forStableStatusChecking = true) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - val tuples = df1.collect().map(r => r.getAs[Int]("id")) - checkStorageValidation(checkingThread) - // verify result - assert(hook.executed.get()) - val expect = "[2,1]" - assert(tuples.mkString("[", ",", "]").equals(expect)) - sparkSession.stop() - } - } - - test("celeborn spark integration test - clean up the shuffle files if" + - " the referring stage has finished") { - if (Spark3OrNewer) { - // create dummy running stages - FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) - FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession( - enableFailedShuffleCleaner = true, - enableCustomizedRunningStageMgr = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = - triggerStorageCheckThread(Seq(), Seq(1), sparkSession, forStableStatusChecking = true) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - val tuple = df1.collect().map(r => r.getAs[Int]("id")) - checkStorageValidation(checkingThread) - // verify result - assert(hook.executed.get()) - val expect = "[2,1]" - assert(tuple.mkString("[", ",", "]").equals(expect)) - sparkSession.stop() - } - } - - test("celeborn spark integration test - clean up the shuffle files if" + - " the upstream stage is indeterministic") { - if (Spark3OrNewer) { - TestRunningStageManager.runningStages += 2 - TestRunningStageManager.indeterministicStages += 0 - // create dummy running stages - FailedShuffleCleaner.celebornShuffleIdToReferringStages.put(0, new mutable.HashSet[Int]) - FailedShuffleCleaner.celebornShuffleIdToReferringStages.get(0) += 2 - val sparkSession = createSparkSession( - enableFailedShuffleCleaner = true, - enableCustomizedRunningStageMgr = true) - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - val checkingThread = - triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = true) - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - val tuple = df1.collect().map(r => r.getAs[Int]("id")) - checkStorageValidation(checkingThread) - // verify result - assert(hook.executed.get()) - val expect = "[2,1]" - assert(tuple.mkString("[", ",", "]").equals(expect)) - sparkSession.stop() - } - } } From 018c75e66248eb53cb11a6301231db5dcb39a126 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 9 May 2025 16:27:46 -0700 Subject: [PATCH 079/120] addr comments --- .../celeborn/RunningStageManagerImpl.java | 104 ------------------ .../celeborn/spark/FailedShuffleCleaner.scala | 11 -- .../celeborn/spark/RunningStageManager.scala | 23 ---- .../apache/celeborn/common/CelebornConf.scala | 2 - .../CelebornFetchFailureDiskCleanSuite.scala | 5 +- .../TestRunningStageManager.scala | 48 -------- 6 files changed, 1 insertion(+), 192 deletions(-) delete mode 100644 client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java delete mode 100644 client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala delete mode 100644 tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala diff --git a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java b/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java deleted file mode 100644 index 3a1c667ab8e..00000000000 --- a/client-spark/common/src/main/java/org/apache/spark/shuffle/celeborn/RunningStageManagerImpl.java +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.shuffle.celeborn; - -import java.lang.reflect.Method; -import java.util.HashMap; -import java.util.HashSet; - -import scala.collection.JavaConverters; - -import org.apache.spark.SparkContext$; -import org.apache.spark.scheduler.DAGScheduler; -import org.apache.spark.scheduler.Stage; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.celeborn.reflect.DynFields; -import org.apache.celeborn.spark.RunningStageManager; - -public class RunningStageManagerImpl implements RunningStageManager { - - private static final Logger LOG = LoggerFactory.getLogger(RunningStageManagerImpl.class); - private static final DynFields.UnboundField id_FIELD = - DynFields.builder().hiddenImpl(Stage.class, "id").build(); - private static final DynFields.UnboundField runningStages_FIELD = - DynFields.builder().hiddenImpl(DAGScheduler.class, "runningStages").build(); - private static final DynFields.UnboundField stageIdToStage_FIELD = - DynFields.builder().hiddenImpl(DAGScheduler.class, "stageIdToStage").build(); - - private HashSet runningStages() { - try { - DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); - return new HashSet<>( - JavaConverters.asJavaCollectionConverter( - (scala.collection.mutable.HashSet) - runningStages_FIELD.bind(dagScheduler).get()) - .asJavaCollection()); - } catch (Exception e) { - LOG.error("cannot get running stages", e); - return new HashSet<>(); - } - } - - private HashMap stageIdToStageMap() { - try { - DAGScheduler dagScheduler = SparkContext$.MODULE$.getActive().get().dagScheduler(); - return new HashMap<>( - JavaConverters.mapAsJavaMapConverter( - (scala.collection.mutable.HashMap) - stageIdToStage_FIELD.bind(dagScheduler).get()) - .asJava()); - } catch (Exception e) { - LOG.error("cannot get running stages", e); - return new HashMap<>(); - } - } - - public boolean isRunningStage(int stageId) { - try { - for (Object stage : runningStages()) { - int currentStageId = (Integer) id_FIELD.bind(stage).get(); - if (currentStageId == stageId) { - return true; - } - } - return false; - } catch (Exception e) { - LOG.error("unexpected exception when checking whether it is running stage ", e); - return true; - } - } - - @Override - public boolean isDeterministicStage(int stageId) { - HashMap map = stageIdToStageMap(); - Object stage = map.get(stageId); - try { - Method isIndeterminateMethod = stage.getClass().getMethod("isIndeterminate"); - boolean isIndeterminate = (boolean) isIndeterminateMethod.invoke(stage); - return !isIndeterminate; - } catch (NoSuchMethodException e) { - LOG.warn("Method isIndeterminate not found on stage object: " + e.getMessage()); - return true; - } catch (Exception e) { - LOG.warn("Error invoking isIndeterminate: " + e.getMessage()); - return true; - } - } -} diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index bc5be2bef46..0afc5f24f93 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -41,17 +41,6 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private lazy val cleanInterval = lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS - private def buildRunningStageChecker(): RunningStageManager = { - val lifecycleMgrRef = lifecycleManager.get() - if (lifecycleMgrRef != null) { - val className = lifecycleManager.get().conf.clientFetchCleanFailedShuffleRunningMgrImpl - val claz = Class.forName(className) - claz.getDeclaredConstructor().newInstance().asInstanceOf[RunningStageManager] - } else { - null - } - } - // for test def reset(): Unit = { shufflesToBeCleaned.clear() diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala deleted file mode 100644 index 5ddc0b5579d..00000000000 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/RunningStageManager.scala +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.celeborn.spark - -trait RunningStageManager { - def isRunningStage(stageId: Int): Boolean - - def isDeterministicStage(stageId: Int): Boolean -} diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index 14ec4637329..bd094fb04d4 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -1000,8 +1000,6 @@ class CelebornConf(loadDefaults: Boolean) extends Cloneable with Logging with Se def clientFetchCleanFailedShuffle: Boolean = get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE) def clientFetchCleanFailedShuffleIntervalMS: Long = get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL) - def clientFetchCleanFailedShuffleRunningMgrImpl: String = - get(CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_RUNNING_STAGE_MGR_IMPL) def clientFetchExcludeWorkerOnFailureEnabled: Boolean = get(CLIENT_FETCH_EXCLUDE_WORKER_ON_FAILURE_ENABLED) def clientFetchExcludedWorkerExpireTimeout: Long = diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 74817b4fa22..f80c2a6c9cc 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -16,12 +16,9 @@ */ package org.apache.celeborn.tests.spark -import scala.collection.mutable - import org.apache.spark.shuffle.celeborn.{SparkUtils, TestCelebornShuffleManager} -import org.apache.celeborn.spark.FailedShuffleCleaner -import org.apache.celeborn.tests.spark.fetch_failure.{FailCommitShuffleReaderGetHook, FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook, TestRunningStageManager} +import org.apache.celeborn.tests.spark.fetch_failure.{FetchFailureDiskCleanBase, FileDeletionShuffleReaderGetHook} class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala deleted file mode 100644 index 41d226ed3a9..00000000000 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/TestRunningStageManager.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.celeborn.tests.spark.fetch_failure - -import scala.collection.mutable - -import org.apache.spark.scheduler.SparkSchedulerHelper - -import org.apache.celeborn.spark.RunningStageManager - -class TestRunningStageManager extends RunningStageManager { - import TestRunningStageManager._ - - override def isRunningStage(stageId: Int): Boolean = { - if (runningStages.contains(stageId)) { - true - } else { - SparkSchedulerHelper.runningStages.map(_.id).contains(stageId) - } - } - - override def isDeterministicStage(stageId: Int): Boolean = { - if (indeterministicStages.contains(stageId)) { - false - } else { - true - } - } -} - -object TestRunningStageManager { - val runningStages = new mutable.HashSet[Int] - val indeterministicStages = new mutable.HashSet[Int]() -} From 43e50c64ae7275ecdf66d843823b9d187566ae28 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 9 May 2025 18:05:40 -0700 Subject: [PATCH 080/120] 4 mins --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index ec2d278051f..a2f3d7909b6 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -141,7 +141,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite checkingThread } - protected def checkStorageValidation(thread: Thread, timeout: Long = 120 * 1000): Unit = { + protected def checkStorageValidation(thread: Thread, timeout: Long = 240 * 1000): Unit = { val checkingThread = thread.asInstanceOf[CheckingThread] checkingThread.join(timeout) if (checkingThread.isAlive || checkingThread.exception != null) { From 3025a396527ce50d1f1158ee20f559d60ccbd300 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 9 May 2025 19:01:06 -0700 Subject: [PATCH 081/120] avoid driver oom --- .../tests/spark/fetch_failure/FetchFailureTestBase.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index 1f2f84a4bd4..4d9d1efb50d 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -39,6 +39,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) + .config("spark.driver.memory", "4g") .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) .config("spark.celeborn.shuffle.enabled", "true") .config("spark.celeborn.client.shuffle.expired.checkInterval", "1s") From c8ed30a3d8f9fd62462fdc4da0f59d0343dfdd1a Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 9 May 2025 19:34:04 -0700 Subject: [PATCH 082/120] 16g? --- .../tests/spark/fetch_failure/FetchFailureTestBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index 4d9d1efb50d..68e39ccd982 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -39,7 +39,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) - .config("spark.driver.memory", "4g") + .config("spark.driver.memory", "16g") .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) .config("spark.celeborn.shuffle.enabled", "true") .config("spark.celeborn.client.shuffle.expired.checkInterval", "1s") From 68ed11e0cdd6b5535f9f4ec497c6f2134efd84ca Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 9 May 2025 21:20:36 -0700 Subject: [PATCH 083/120] change --- pom.xml | 4 ++-- .../tests/spark/fetch_failure/FetchFailureTestBase.scala | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 31a38cda12a..7d8cd2b36df 100644 --- a/pom.xml +++ b/pom.xml @@ -907,7 +907,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 1g + 4g ${spark.shuffle.plugin.class} @@ -946,7 +946,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 1g + 4g ${spark.shuffle.plugin.class} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index 68e39ccd982..4d9d1efb50d 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -39,7 +39,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) - .config("spark.driver.memory", "16g") + .config("spark.driver.memory", "4g") .config("spark.celeborn.shuffle.forceFallback.partition.enabled", false) .config("spark.celeborn.shuffle.enabled", "true") .config("spark.celeborn.client.shuffle.expired.checkInterval", "1s") From 6f1a7fe8d4c278a76aac220d79362b4bf2568ae6 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 10 May 2025 11:44:47 -0700 Subject: [PATCH 084/120] smaller test data? --- .../spark/CelebornFetchFailureDiskCleanSuite.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index f80c2a6c9cc..35b6bb6fd6a 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -33,15 +33,15 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { workerDirs, shuffleIdToBeDeleted = Seq(0)) TestCelebornShuffleManager.registerReaderGetHook(hook) - val value = Range(1, 10000).mkString(",") + val value = Range(1, 32).mkString(",") val checkingThread = triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 32, 2) .map { i => (i, value) }.groupByKey(16).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 10000) + assert(tuples.length == 32) for (elem <- tuples) { assert(elem._2.mkString(",").equals(value)) } @@ -61,13 +61,13 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { shuffleIdToBeDeleted = Seq(0, 1), triggerStageId = Some(2)) TestCelebornShuffleManager.registerReaderGetHook(hook) - val value = Range(1, 10000).mkString(",") + val value = Range(1, 32).mkString(",") val checkingThread = triggerStorageCheckThread( Seq(0, 1), Seq(2, 3, 4), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 32, 2) .map { i => (i, value) }.groupByKey(16).map { case (k, elements) => (k, elements.map(str => str.toLowerCase)) @@ -75,7 +75,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 10000) + assert(tuples.length == 32) for (elem <- tuples) { assert(elem._2.flatten.flatten.mkString(",").equals(value)) } From 5e2b5079d073c62875ee50513916a40d56bb9b90 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 10 May 2025 12:55:39 -0700 Subject: [PATCH 085/120] recover test data size to ensure enough partitions --- .../spark/CelebornFetchFailureDiskCleanSuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 35b6bb6fd6a..1e7055b534c 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -33,15 +33,15 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { workerDirs, shuffleIdToBeDeleted = Seq(0)) TestCelebornShuffleManager.registerReaderGetHook(hook) - val value = Range(1, 32).mkString(",") + val value = Range(1, 10000).mkString(",") val checkingThread = triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 32, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) .map { i => (i, value) }.groupByKey(16).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 32) + assert(tuples.length == 10000) for (elem <- tuples) { assert(elem._2.mkString(",").equals(value)) } @@ -67,7 +67,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { Seq(2, 3, 4), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 32, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) .map { i => (i, value) }.groupByKey(16).map { case (k, elements) => (k, elements.map(str => str.toLowerCase)) @@ -75,7 +75,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 32) + assert(tuples.length == 10000) for (elem <- tuples) { assert(elem._2.flatten.flatten.mkString(",").equals(value)) } From 61ef8fcca18dc551b817c5638b13acb5fc1e11ec Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 10 May 2025 12:59:56 -0700 Subject: [PATCH 086/120] code cleanup --- .../org/apache/celeborn/spark/FailedShuffleCleaner.scala | 4 +--- .../tests/spark/fetch_failure/FetchFailureTestBase.scala | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 0afc5f24f93..3adb17ebb85 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -17,7 +17,7 @@ package org.apache.celeborn.spark import java.util -import java.util.concurrent.{ConcurrentHashMap, LinkedBlockingQueue, ScheduledExecutorService, TimeUnit} +import java.util.concurrent.{LinkedBlockingQueue, ScheduledExecutorService, TimeUnit} import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ @@ -36,8 +36,6 @@ private[celeborn] object FailedShuffleCleaner extends Logging { private val shufflesToBeCleaned = new LinkedBlockingQueue[Int]() private val cleanedShuffleIds = new mutable.HashSet[Int] - private val lock = new Object - private lazy val cleanInterval = lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index 4d9d1efb50d..902893c8bc5 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -35,7 +35,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { } else { "fetch-failure-failed-shuffle-clean" } - }).setMaster("local[2,3]") + }).setMaster("local[1,4]") var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) From 6a10ef8fb511e78159df09bec54282ff8b78b625 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Sat, 10 May 2025 19:31:16 -0700 Subject: [PATCH 087/120] use more cores --- .../tests/spark/fetch_failure/FetchFailureTestBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala index 902893c8bc5..10f4f84578c 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureTestBase.scala @@ -35,7 +35,7 @@ private[tests] trait FetchFailureTestBase extends SparkTestBase { } else { "fetch-failure-failed-shuffle-clean" } - }).setMaster("local[1,4]") + }).setMaster("local[*]") var baseBuilder = SparkSession.builder() .config(updateSparkConf(sparkConf, ShuffleMode.HASH)) .config("spark.sql.shuffle.partitions", 2) From 76e92e4044a1169c6676f87fddf611e48171a35f Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 17:15:51 -0700 Subject: [PATCH 088/120] add back original test --- .../CelebornFetchFailureDiskCleanSuite.scala | 48 ++++++++++++++++--- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 1e7055b534c..45715aeec5d 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -33,17 +33,16 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { workerDirs, shuffleIdToBeDeleted = Seq(0)) TestCelebornShuffleManager.registerReaderGetHook(hook) - val value = Range(1, 10000).mkString(",") val checkingThread = triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) - .map { i => (i, value) }.groupByKey(16).collect() + .map { i => (i, s"$i") }.groupByKey(16).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) assert(tuples.length == 10000) for (elem <- tuples) { - assert(elem._2.mkString(",").equals(value)) + elem._2.foreach(s => assert(s.equals(elem._1.toString))) } sparkSession.stop() } @@ -61,14 +60,13 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { shuffleIdToBeDeleted = Seq(0, 1), triggerStageId = Some(2)) TestCelebornShuffleManager.registerReaderGetHook(hook) - val value = Range(1, 32).mkString(",") val checkingThread = triggerStorageCheckThread( Seq(0, 1), Seq(2, 3, 4), sparkSession, forStableStatusChecking = false) val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) - .map { i => (i, value) }.groupByKey(16).map { + .map { i => (i, i.toString) }.groupByKey(16).map { case (k, elements) => (k, elements.map(str => str.toLowerCase)) }.groupByKey(4).groupByKey(2).collect() @@ -77,7 +75,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { assert(hook.executed.get()) assert(tuples.length == 10000) for (elem <- tuples) { - assert(elem._2.flatten.flatten.mkString(",").equals(value)) + elem._2.flatten.flatten.foreach(s => s.equals(elem._1.toString)) } sparkSession.stop() } @@ -136,4 +134,42 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { sparkSession.stop() } } + + test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + + " correctly") { + if (Spark3OrNewer) { + val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) + + val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) + val hook = new FileDeletionShuffleReaderGetHook( + celebornConf, + workerDirs, + shuffleIdToBeDeleted = Seq(0, 1, 2, 3), + triggerStageId = Some(4)) + TestCelebornShuffleManager.registerReaderGetHook(hook) + + val checkingThread = triggerStorageCheckThread( + Seq(0, 1, 2, 3), + Seq(4, 5, 6, 7), + sparkSession, + forStableStatusChecking = false) + + import sparkSession.implicits._ + val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df1_count") + val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() + .withColumnRenamed("count", "countId").groupBy("countId").count() + .withColumnRenamed("count", "df2_count") + + val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() + + checkStorageValidation(checkingThread) + // verify result + assert(hook.executed.get()) + val expect = "[1,2,2]" + assert(tuples.head.toString().equals(expect)) + sparkSession.stop() + } + } } From f63c81ebf8f95cc38a2a7ca113446bdc7f199118 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 18:19:43 -0700 Subject: [PATCH 089/120] stylistic fixes --- .../java/org/apache/spark/shuffle/celeborn/SparkUtils.java | 4 ++-- .../scala/org/apache/celeborn/client/LifecycleManager.scala | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index e285be25a18..ca663fc39d5 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -136,8 +136,8 @@ public static int celebornShuffleId( TaskContext context, Boolean isWriter) { if (handle.throwsFetchFailure()) { - String appShuffleIdentifier = SparkCommonUtils.encodeAppShuffleIdentifier(handle.shuffleId(), - context); + String appShuffleIdentifier = + SparkCommonUtils.encodeAppShuffleIdentifier(handle.shuffleId(), context); Tuple2 res = client.getShuffleId( handle.shuffleId(), diff --git a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala index a9685623d6f..05768b65a28 100644 --- a/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala +++ b/client/src/main/scala/org/apache/celeborn/client/LifecycleManager.scala @@ -957,7 +957,8 @@ class LifecycleManager(val appUniqueId: String, val conf: CelebornConf) extends val pbGetShuffleIdResponse = { logDebug( s"get shuffleId $celebornShuffleId for appShuffleId $appShuffleId appShuffleIdentifier $appShuffleIdentifier isWriter $isWriter") - PbGetShuffleIdResponse.newBuilder().setShuffleId(celebornShuffleId).setSuccess(true).build() + PbGetShuffleIdResponse.newBuilder().setShuffleId(celebornShuffleId).setSuccess( + true).build() } context.reply(pbGetShuffleIdResponse) case None => From 6a60db651d0f5e064853a8f521a35fd36bfe9720 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 19:20:52 -0700 Subject: [PATCH 090/120] less data --- pom.xml | 4 ++-- .../tests/spark/CelebornFetchFailureDiskCleanSuite.scala | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index 7d8cd2b36df..056be6b478f 100644 --- a/pom.xml +++ b/pom.xml @@ -907,7 +907,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 4g + 10g ${spark.shuffle.plugin.class} @@ -946,7 +946,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 4g + 10g ${spark.shuffle.plugin.class} diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 45715aeec5d..232d4cc6891 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -35,12 +35,12 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { TestCelebornShuffleManager.registerReaderGetHook(hook) val checkingThread = triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 1000, 2) .map { i => (i, s"$i") }.groupByKey(16).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 10000) + assert(tuples.length == 1000) for (elem <- tuples) { elem._2.foreach(s => assert(s.equals(elem._1.toString))) } @@ -65,7 +65,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { Seq(2, 3, 4), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 10000, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 1000, 2) .map { i => (i, i.toString) }.groupByKey(16).map { case (k, elements) => (k, elements.map(str => str.toLowerCase)) @@ -73,7 +73,7 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 10000) + assert(tuples.length == 1000) for (elem <- tuples) { elem._2.flatten.flatten.foreach(s => s.equals(elem._1.toString)) } From ba7b882500a84f821fccf49ff1cbbc49781a5ca1 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 20:15:52 -0700 Subject: [PATCH 091/120] further reduce memory overhead --- .../CelebornFetchFailureDiskCleanSuite.scala | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index 232d4cc6891..a04bc3164db 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -35,14 +35,14 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { TestCelebornShuffleManager.registerReaderGetHook(hook) val checkingThread = triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 1000, 2) - .map { i => (i, s"$i") }.groupByKey(16).collect() + val tuples = sparkSession.sparkContext.parallelize(1 to 100, 2) + .map { i => (i, i) }.groupByKey(16).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 1000) + assert(tuples.length == 100) for (elem <- tuples) { - elem._2.foreach(s => assert(s.equals(elem._1.toString))) + elem._2.foreach(i => assert(i.equals(elem._1))) } sparkSession.stop() } @@ -65,17 +65,16 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { Seq(2, 3, 4), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 1000, 2) - .map { i => (i, i.toString) }.groupByKey(16).map { - case (k, elements) => - (k, elements.map(str => str.toLowerCase)) + val tuples = sparkSession.sparkContext.parallelize(1 to 100, 2) + .map { i => (i, i) }.groupByKey(16).map { + case (k, elements) => (k, elements.map(i => i)) }.groupByKey(4).groupByKey(2).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 1000) + assert(tuples.length == 100) for (elem <- tuples) { - elem._2.flatten.flatten.foreach(s => s.equals(elem._1.toString)) + elem._2.flatten.flatten.foreach(s => s.equals(elem._1)) } sparkSession.stop() } From e6f87fdd5324ea3993ade8a6154023d3b3e7ef3a Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 21:05:10 -0700 Subject: [PATCH 092/120] addr comments --- .../scala/org/apache/celeborn/common/CelebornConf.scala | 8 -------- pom.xml | 4 ++-- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala index bd094fb04d4..86515916794 100644 --- a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala +++ b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala @@ -4824,14 +4824,6 @@ object CelebornConf extends Logging { .booleanConf .createWithDefault(false) - val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_RUNNING_STAGE_MGR_IMPL: ConfigEntry[String] = - buildConf("celeborn.client.spark.fetch.cleanFailedShuffle.runningStageManagerImpl") - .categories("client") - .version("0.6.0") - .doc("full class name of of running stage manager implementation, mainly for test") - .stringConf - .createWithDefault("org.apache.spark.shuffle.celeborn.RunningStageManagerImpl") - val CLIENT_FETCH_CLEAN_FAILED_SHUFFLE_INTERVAL: ConfigEntry[Long] = buildConf("celeborn.client.spark.fetch.cleanFailedShuffleInterval") .categories("client") diff --git a/pom.xml b/pom.xml index 056be6b478f..dfc8a04ca21 100644 --- a/pom.xml +++ b/pom.xml @@ -907,7 +907,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 10g + 6g ${spark.shuffle.plugin.class} @@ -946,7 +946,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 10g + 6g ${spark.shuffle.plugin.class} From 00914c3a71609bef48a2b0075340710c112de3aa Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 21:12:22 -0700 Subject: [PATCH 093/120] doc update --- docs/configuration/client.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/configuration/client.md b/docs/configuration/client.md index 9e697ca48ba..e4e8e0e83ee 100644 --- a/docs/configuration/client.md +++ b/docs/configuration/client.md @@ -112,7 +112,6 @@ license: | | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 | | | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 | | | celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.6.0 | | -| celeborn.client.spark.fetch.cleanFailedShuffle.runningStageManagerImpl | org.apache.spark.shuffle.celeborn.RunningStageManagerImpl | false | full class name of of running stage manager implementation, mainly for test | 0.6.0 | | | celeborn.client.spark.fetch.cleanFailedShuffleInterval | 1s | false | the interval to clean the failed-to-fetch shuffle files, only valid when celeborn.client.spark.fetch.cleanFailedShuffle is enabled | 0.6.0 | | | celeborn.client.spark.push.dynamicWriteMode.enabled | false | false | Whether to dynamically switch push write mode based on conditions.If true, shuffle mode will be only determined by partition count | 0.5.0 | | | celeborn.client.spark.push.dynamicWriteMode.partitionNum.threshold | 2000 | false | Threshold of shuffle partition number for dynamically switching push writer mode. When the shuffle partition number is greater than this value, use the sort-based shuffle writer for memory efficiency; otherwise use the hash-based shuffle writer for speed. This configuration only takes effect when celeborn.client.spark.push.dynamicWriteMode.enabled is true. | 0.5.0 | | From c5a64950606a77ce07f8b1bbba4d2035acaff818 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 22:06:24 -0700 Subject: [PATCH 094/120] 10g --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index dfc8a04ca21..056be6b478f 100644 --- a/pom.xml +++ b/pom.xml @@ -907,7 +907,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 6g + 10g ${spark.shuffle.plugin.class} @@ -946,7 +946,7 @@ file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml ${project.build.directory}/tmp - 6g + 10g ${spark.shuffle.plugin.class} From 802431f403a2ef01200fbf30621d7dab2839e051 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Tue, 13 May 2025 23:12:23 -0700 Subject: [PATCH 095/120] further reduce test data --- .../spark/CelebornFetchFailureDiskCleanSuite.scala | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index a04bc3164db..e1efc8f2fb0 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -35,12 +35,12 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { TestCelebornShuffleManager.registerReaderGetHook(hook) val checkingThread = triggerStorageCheckThread(Seq(0), Seq(1), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 100, 2) - .map { i => (i, i) }.groupByKey(16).collect() + val tuples = sparkSession.sparkContext.parallelize(1 to 10, 2) + .map { i => (i, i) }.groupByKey(4).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 100) + assert(tuples.length == 10) for (elem <- tuples) { elem._2.foreach(i => assert(i.equals(elem._1))) } @@ -65,14 +65,14 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { Seq(2, 3, 4), sparkSession, forStableStatusChecking = false) - val tuples = sparkSession.sparkContext.parallelize(1 to 100, 2) + val tuples = sparkSession.sparkContext.parallelize(1 to 10, 2) .map { i => (i, i) }.groupByKey(16).map { case (k, elements) => (k, elements.map(i => i)) }.groupByKey(4).groupByKey(2).collect() checkStorageValidation(checkingThread) // verify result assert(hook.executed.get()) - assert(tuples.length == 100) + assert(tuples.length == 10) for (elem <- tuples) { elem._2.flatten.flatten.foreach(s => s.equals(elem._1)) } From 7b09c435578b922e3787d951df68a416c36a3480 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 14 May 2025 07:50:21 -0700 Subject: [PATCH 096/120] enlength timeout --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index a2f3d7909b6..c303d54eaf1 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -141,7 +141,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite checkingThread } - protected def checkStorageValidation(thread: Thread, timeout: Long = 240 * 1000): Unit = { + protected def checkStorageValidation(thread: Thread, timeout: Long = 600 * 1000): Unit = { val checkingThread = thread.asInstanceOf[CheckingThread] checkingThread.join(timeout) if (checkingThread.isAlive || checkingThread.exception != null) { From d8f0a275f2b1be5ae0b108f8357610009de59a5b Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 14 May 2025 08:47:49 -0700 Subject: [PATCH 097/120] recover to 240 --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index c303d54eaf1..a2f3d7909b6 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -141,7 +141,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite checkingThread } - protected def checkStorageValidation(thread: Thread, timeout: Long = 600 * 1000): Unit = { + protected def checkStorageValidation(thread: Thread, timeout: Long = 240 * 1000): Unit = { val checkingThread = thread.asInstanceOf[CheckingThread] checkingThread.join(timeout) if (checkingThread.isAlive || checkingThread.exception != null) { From 79160592074da2dbead464b35eec9c97a0f642f4 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 14 May 2025 08:49:29 -0700 Subject: [PATCH 098/120] rm one expensive test --- .../CelebornFetchFailureDiskCleanSuite.scala | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala index e1efc8f2fb0..faac37903ba 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/CelebornFetchFailureDiskCleanSuite.scala @@ -133,42 +133,4 @@ class CelebornFetchFailureDiskCleanSuite extends FetchFailureDiskCleanBase { sparkSession.stop() } } - - test("celeborn spark integration test - (M-1 dep with multi-level lineage) the failed shuffle files are all cleaned up" + - " correctly") { - if (Spark3OrNewer) { - val sparkSession = createSparkSession(enableFailedShuffleCleaner = true) - - val celebornConf = SparkUtils.fromSparkConf(sparkSession.sparkContext.getConf) - val hook = new FileDeletionShuffleReaderGetHook( - celebornConf, - workerDirs, - shuffleIdToBeDeleted = Seq(0, 1, 2, 3), - triggerStageId = Some(4)) - TestCelebornShuffleManager.registerReaderGetHook(hook) - - val checkingThread = triggerStorageCheckThread( - Seq(0, 1, 2, 3), - Seq(4, 5, 6, 7), - sparkSession, - forStableStatusChecking = false) - - import sparkSession.implicits._ - val df1 = Seq((1, "a"), (2, "b")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df1_count") - val df2 = Seq((2, "c"), (3, "d")).toDF("id", "data").groupBy("id").count() - .withColumnRenamed("count", "countId").groupBy("countId").count() - .withColumnRenamed("count", "df2_count") - - val tuples = df1.hint("merge").join(df2, "countId").select("*").collect() - - checkStorageValidation(checkingThread) - // verify result - assert(hook.executed.get()) - val expect = "[1,2,2]" - assert(tuples.head.toString().equals(expect)) - sparkSession.stop() - } - } } From c5dbaf54d5ef92e61792ccd56775069f2607b0db Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 14 May 2025 10:46:13 -0700 Subject: [PATCH 099/120] check faster --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index a2f3d7909b6..cbeb1b40645 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -89,7 +89,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite override def run(): Unit = { var allDataInShape = checkDirStatus() while (!allDataInShape) { - Thread.sleep(1000) + Thread.sleep(100) allDataInShape = checkDirStatus() } } From 8b99b7ab0451a0bec2fa99bd434dbe027e665162 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Wed, 14 May 2025 19:18:51 -0700 Subject: [PATCH 100/120] check per sec --- .../tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala index cbeb1b40645..a2f3d7909b6 100644 --- a/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala +++ b/tests/spark-it/src/test/scala/org/apache/celeborn/tests/spark/fetch_failure/FetchFailureDiskCleanBase.scala @@ -89,7 +89,7 @@ private[tests] trait FetchFailureDiskCleanBase extends AnyFunSuite override def run(): Unit = { var allDataInShape = checkDirStatus() while (!allDataInShape) { - Thread.sleep(100) + Thread.sleep(1000) allDataInShape = checkDirStatus() } } From e18a6d964b27b9930129d0d93ea3f40b38cd0209 Mon Sep 17 00:00:00 2001 From: CodingCat Date: Fri, 16 May 2025 07:32:38 -0700 Subject: [PATCH 101/120] addr comments --- .../celeborn/spark/FailedShuffleCleaner.scala | 60 +++++++++---------- .../shuffle/celeborn/SparkShuffleManager.java | 15 +++-- .../spark/shuffle/celeborn/SparkUtils.java | 12 ++-- pom.xml | 4 +- .../FetchFailureDiskCleanBase.scala | 1 - 5 files changed, 44 insertions(+), 48 deletions(-) diff --git a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala index 3adb17ebb85..e88f6f640be 100644 --- a/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala +++ b/client-spark/common/src/main/scala/org/apache/celeborn/spark/FailedShuffleCleaner.scala @@ -18,7 +18,6 @@ package org.apache.celeborn.spark import java.util import java.util.concurrent.{LinkedBlockingQueue, ScheduledExecutorService, TimeUnit} -import java.util.concurrent.atomic.AtomicReference import scala.collection.JavaConverters._ import scala.collection.mutable @@ -29,15 +28,14 @@ import org.apache.celeborn.client.LifecycleManager import org.apache.celeborn.common.internal.Logging import org.apache.celeborn.common.util.ThreadUtils -private[celeborn] object FailedShuffleCleaner extends Logging { +private[celeborn] class FailedShuffleCleaner(lifecycleManager: LifecycleManager) extends Logging { - private val lifecycleManager = new AtomicReference[LifecycleManager](null) // in celeborn ids private val shufflesToBeCleaned = new LinkedBlockingQueue[Int]() private val cleanedShuffleIds = new mutable.HashSet[Int] private lazy val cleanInterval = - lifecycleManager.get().conf.clientFetchCleanFailedShuffleIntervalMS + lifecycleManager.conf.clientFetchCleanFailedShuffleIntervalMS // for test def reset(): Unit = { @@ -47,48 +45,46 @@ private[celeborn] object FailedShuffleCleaner extends Logging { cleanerThreadPool.shutdownNow() cleanerThreadPool = null } - lifecycleManager.set(null) } def addShuffleIdToBeCleaned(appShuffleIdentifier: String): Unit = { val Array(appShuffleId, _, _) = SparkCommonUtils.decodeAppShuffleIdentifier( appShuffleIdentifier) - lifecycleManager.get().getShuffleIdMapping.get(appShuffleId.toInt).foreach { + lifecycleManager.getShuffleIdMapping.get(appShuffleId.toInt).foreach { case (_, (celebornShuffleId, _)) => shufflesToBeCleaned.put(celebornShuffleId) } } - def setLifecycleManager(ref: LifecycleManager): Unit = { - val firstSet = lifecycleManager.compareAndSet(null, ref) - if (firstSet) { - cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( - "failedShuffleCleanerThreadPool") - cleanerThreadPool.scheduleWithFixedDelay( - new Runnable { - override def run(): Unit = { - try { - val allShuffleIds = new util.ArrayList[Int] - shufflesToBeCleaned.drainTo(allShuffleIds) - allShuffleIds.asScala.foreach { shuffleId => - if (!cleanedShuffleIds.contains(shuffleId)) { - lifecycleManager.get().unregisterShuffle(shuffleId) - logInfo( - s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") - cleanedShuffleIds += shuffleId - } + def init(): Unit = { + cleanerThreadPool = ThreadUtils.newDaemonSingleThreadScheduledExecutor( + "failedShuffleCleanerThreadPool") + cleanerThreadPool.scheduleWithFixedDelay( + new Runnable { + override def run(): Unit = { + try { + val allShuffleIds = new util.ArrayList[Int] + shufflesToBeCleaned.drainTo(allShuffleIds) + allShuffleIds.asScala.foreach { shuffleId => + if (!cleanedShuffleIds.contains(shuffleId)) { + lifecycleManager.unregisterShuffle(shuffleId) + logInfo( + s"sent unregister shuffle request for shuffle $shuffleId (celeborn shuffle id)") + cleanedShuffleIds += shuffleId } - } catch { - case e: Exception => - logError("unexpected exception in cleaner thread", e) } + } catch { + case e: Exception => + logError("unexpected exception in cleaner thread", e) } - }, - cleanInterval, - cleanInterval, - TimeUnit.MILLISECONDS) - } + } + }, + cleanInterval, + cleanInterval, + TimeUnit.MILLISECONDS) } + init() + def removeCleanedShuffleId(celebornShuffleId: Int): Unit = { cleanedShuffleIds.remove(celebornShuffleId) } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java index 9327a8dc059..17c398e07be 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkShuffleManager.java @@ -85,6 +85,8 @@ public class SparkShuffleManager implements ShuffleManager { ConcurrentHashMap.newKeySet(); private final CelebornShuffleFallbackPolicyRunner fallbackPolicyRunner; + private FailedShuffleCleaner failedShuffleCleaner = null; + private long sendBufferPoolCheckInterval; private long sendBufferPoolExpireTimeout; @@ -167,13 +169,12 @@ private void initializeLifecycleManager(String appId) { + CelebornConf.CLIENT_FETCH_CLEAN_FAILED_SHUFFLE().key() + " is set to true"); } + failedShuffleCleaner = new FailedShuffleCleaner(lifecycleManager); lifecycleManager.registerValidateCelebornShuffleIdForCleanCallback( (appShuffleIdentifier) -> - SparkUtils.addWriterShuffleIdsToBeCleaned( - lifecycleManager, appShuffleIdentifier)); + SparkUtils.addWriterShuffleIdsToBeCleaned(this, appShuffleIdentifier)); lifecycleManager.registerUnregisterShuffleCallback( - (celebornShuffleId) -> - SparkUtils.removeCleanedShuffleId(lifecycleManager, celebornShuffleId)); + (celebornShuffleId) -> SparkUtils.removeCleanedShuffleId(this, celebornShuffleId)); } if (celebornConf.getReducerFileGroupBroadcastEnabled()) { @@ -267,7 +268,7 @@ public void stop() { _sortShuffleManager = null; } if (celebornConf.clientFetchCleanFailedShuffle()) { - FailedShuffleCleaner.reset(); + failedShuffleCleaner.reset(); } } @@ -490,4 +491,8 @@ private void checkUserClassPathFirst(ShuffleHandle handle) { public LifecycleManager getLifecycleManager() { return this.lifecycleManager; } + + public FailedShuffleCleaner getFailedShuffleCleaner() { + return this.failedShuffleCleaner; + } } diff --git a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java index ca663fc39d5..fc5d605d8ac 100644 --- a/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java +++ b/client-spark/spark-3/src/main/java/org/apache/spark/shuffle/celeborn/SparkUtils.java @@ -64,7 +64,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.celeborn.client.LifecycleManager; import org.apache.celeborn.client.ShuffleClient; import org.apache.celeborn.common.CelebornConf; import org.apache.celeborn.common.exception.CelebornRuntimeException; @@ -76,7 +75,6 @@ import org.apache.celeborn.reflect.DynConstructors; import org.apache.celeborn.reflect.DynFields; import org.apache.celeborn.reflect.DynMethods; -import org.apache.celeborn.spark.FailedShuffleCleaner; public class SparkUtils { private static final Logger LOG = LoggerFactory.getLogger(SparkUtils.class); @@ -627,14 +625,12 @@ public static void invalidateSerializedGetReducerFileGroupResponse(Integer shuff } public static void addWriterShuffleIdsToBeCleaned( - LifecycleManager lifecycleManager, String appShuffleIdentifier) { - FailedShuffleCleaner.setLifecycleManager(lifecycleManager); - FailedShuffleCleaner.addShuffleIdToBeCleaned(appShuffleIdentifier); + SparkShuffleManager sparkShuffleManager, String appShuffleIdentifier) { + sparkShuffleManager.getFailedShuffleCleaner().addShuffleIdToBeCleaned(appShuffleIdentifier); } public static void removeCleanedShuffleId( - LifecycleManager lifecycleManager, int celebornShuffleId) { - FailedShuffleCleaner.setLifecycleManager(lifecycleManager); - FailedShuffleCleaner.removeCleanedShuffleId(celebornShuffleId); + SparkShuffleManager sparkShuffleManager, int celebornShuffleId) { + sparkShuffleManager.getFailedShuffleCleaner().removeCleanedShuffleId(celebornShuffleId); } } diff --git a/pom.xml b/pom.xml index 056be6b478f..2b12b3aeac6 100644 --- a/pom.xml +++ b/pom.xml @@ -902,7 +902,7 @@ **/*Test*.* ${project.build.directory}/surefire-reports - ${argLine} -ea -Xmx4g -Xss4m -XX:MaxMetaspaceSize=2g -XX:ReservedCodeCacheSize=128m ${extraJavaTestArgs} -Dio.netty.tryReflectionSetAccessible=true + ${argLine} -ea -Xmx10g -Xss2g -XX:MaxMetaspaceSize=2g -XX:ReservedCodeCacheSize=128m ${extraJavaTestArgs} -Dio.netty.tryReflectionSetAccessible=true file:src/test/resources/log4j.properties src/test/resources/log4j2-test.xml @@ -935,7 +935,7 @@ ${maven.plugin.scalatest.version} ${project.build.directory}/surefire-reports - ${argLine} -ea -Xmx4g -Xss4m -XX:MaxMetaspaceSize=2g -XX:ReservedCodeCacheSize=128m ${extraJavaTestArgs} -Dio.netty.tryReflectionSetAccessible=true + ${argLine} -ea -Xmx10g -Xss2g -XX:MaxMetaspaceSize=2g -XX:ReservedCodeCacheSize=128m ${extraJavaTestArgs} -Dio.netty.tryReflectionSetAccessible=true TestSuite.txt