Fixes

s0nskar · s0nskar · commit 3094c92a0496 · 2026-06-11T00:43:33.000+05:30
diff --git a/client/src/main/scala/org/apache/celeborn/client/CommitManager.scala b/client/src/main/scala/org/apache/celeborn/client/CommitManager.scala
@@ -372,7 +372,8 @@ class CommitManager(appUniqueId: String, val conf: CelebornConf, lifecycleManage
   }
 
   private class UnknownWorkerListener extends WorkerStatusListener {
-    private val shuffleDataLostOnUnknownWorkerEnabled = conf.clientShuffleDataLostOnUnknownWorkerEnabled
+    private val shuffleDataLostOnUnknownWorkerEnabled =
+      conf.clientShuffleDataLostOnUnknownWorkerEnabled
     private val pushReplicateEnabled = conf.clientPushReplicateEnabled
 
     override def notifyChangedWorkersStatus(workersStatus: WorkersStatus): Unit = {
diff --git a/client/src/test/scala/org/apache/celeborn/client/CommitManagerSuite.scala b/client/src/test/scala/org/apache/celeborn/client/CommitManagerSuite.scala
@@ -33,18 +33,11 @@ import org.apache.celeborn.CelebornFunSuite
 import org.apache.celeborn.client.LifecycleManager.ShuffleAllocatedWorkers
 import org.apache.celeborn.client.listener.WorkerStatusListener
 import org.apache.celeborn.common.CelebornConf
-import org.apache.celeborn.common.CelebornConf.{
-  CLIENT_BATCH_HANDLE_COMMIT_PARTITION_ENABLED,
-  CLIENT_PUSH_REPLICATE_ENABLED,
-  CLIENT_SHUFFLE_DATA_LOST_ON_UNKNOWN_WORKER_ENABLED
-}
+import org.apache.celeborn.common.CelebornConf.{CLIENT_BATCH_HANDLE_COMMIT_PARTITION_ENABLED, CLIENT_PUSH_REPLICATE_ENABLED, CLIENT_SHUFFLE_DATA_LOST_ON_UNKNOWN_WORKER_ENABLED}
 import org.apache.celeborn.common.meta.{ShufflePartitionLocationInfo, WorkerInfo}
 import org.apache.celeborn.common.network.protocol.SerdeVersion
 import org.apache.celeborn.common.protocol.PartitionType
-import org.apache.celeborn.common.protocol.message.ControlMessages.{
-  GetReducerFileGroupResponse,
-  HeartbeatFromApplicationResponse
-}
+import org.apache.celeborn.common.protocol.message.ControlMessages.{GetReducerFileGroupResponse, HeartbeatFromApplicationResponse}
 import org.apache.celeborn.common.protocol.message.StatusCode
 import org.apache.celeborn.common.rpc.RpcAddress
 import org.apache.celeborn.common.rpc.netty.LocalNettyRpcCallContext
diff --git a/client/src/test/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandlerSuite.scala b/client/src/test/scala/org/apache/celeborn/client/commit/ReducePartitionCommitHandlerSuite.scala
@@ -23,9 +23,9 @@ import scala.concurrent.{Await, Promise}
 import scala.concurrent.duration._
 
 import org.apache.celeborn.CelebornFunSuite
-import org.apache.celeborn.client.WorkerStatusTracker
 import org.apache.celeborn.client.CommitManager.CommittedPartitionInfo
 import org.apache.celeborn.client.LifecycleManager.ShuffleAllocatedWorkers
+import org.apache.celeborn.client.WorkerStatusTracker
 import org.apache.celeborn.common.CelebornConf
 import org.apache.celeborn.common.network.protocol.SerdeVersion
 import org.apache.celeborn.common.protocol.message.ControlMessages.GetReducerFileGroupResponse
@@ -76,7 +76,11 @@ class ReducePartitionCommitHandlerSuite extends CelebornFunSuite {
   test("markShuffleDataLost replies SHUFFLE_DATA_LOST to GetReducerFileGroup contexts") {
     val handler = newHandler()
     val shuffleId = 1
-    handler.registerShuffle(shuffleId, numMappers = 2, isSegmentGranularityVisible = false, numPartitions = 4)
+    handler.registerShuffle(
+      shuffleId,
+      numMappers = 2,
+      isSegmentGranularityVisible = false,
+      numPartitions = 4)
 
     val (ctx1, p1) = pendingContext()
     handler.handleGetReducerFileGroup(ctx1, shuffleId, SerdeVersion.V1)
@@ -97,10 +101,15 @@ class ReducePartitionCommitHandlerSuite extends CelebornFunSuite {
     }
   }
 
-  test("markShuffleDataLost marks data lost even when stage already ended (worker crash after commit)") {
+  test(
+    "markShuffleDataLost marks data lost even when stage already ended (worker crash after commit)") {
     val handler = newHandler()
     val shuffleId = 1
-    handler.registerShuffle(shuffleId, numMappers = 1, isSegmentGranularityVisible = false, numPartitions = 2)
+    handler.registerShuffle(
+      shuffleId,
+      numMappers = 1,
+      isSegmentGranularityVisible = false,
+      numPartitions = 2)
 
     // Clean stage-end
     handler.setStageEnd(shuffleId)
diff --git a/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala b/common/src/main/scala/org/apache/celeborn/common/CelebornConf.scala
@@ -6998,7 +6998,11 @@ object CelebornConf extends Logging {
     buildConf("celeborn.client.shuffleDataLostOnUnknownWorker.enabled")
       .categories("client")
       .version("0.6.3")
-      .doc("Whether to mark shuffle data lost when unknown worker is detected.")
+      .doc("When enabled, any shuffle that had partitions on the (crashed) " +
+        "unknown worker is immediately marked as data lost. " +
+        "On the write flow revive/commit request for that shuffle will fast fail. " +
+        "GetReducerFileGroup requests are replied with SHUFFLE_DATA_LOST. " +
+        s"This has no effect when ${CLIENT_PUSH_REPLICATE_ENABLED.key}=true")
       .booleanConf
       .createWithDefault(true)
 
diff --git a/docs/configuration/client.md b/docs/configuration/client.md
@@ -122,7 +122,7 @@ license: |
 | celeborn.client.shuffle.rangeReadFilter.enabled | false | false | If a spark application have skewed partition, this value can set to true to improve performance. | 0.2.0 | celeborn.shuffle.rangeReadFilter.enabled | 
 | celeborn.client.shuffle.register.filterExcludedWorker.enabled | false | false | Whether to filter excluded worker when register shuffle. | 0.4.0 |  | 
 | celeborn.client.shuffle.reviseLostShuffles.enabled | false | false | Whether to revise lost shuffles. | 0.6.0 |  | 
-| celeborn.client.shuffleDataLostOnUnknownWorker.enabled | false | false | Whether to mark shuffle data lost when unknown worker is detected. | 0.6.3 |  | 
+| celeborn.client.shuffleDataLostOnUnknownWorker.enabled | true | false | When enabled, any shuffle that had partitions on the (crashed) unknown worker is immediately marked as data lost. On the write flow revive/commit request for that shuffle will fast fail. GetReducerFileGroup requests are replied with SHUFFLE_DATA_LOST. This has no effect when celeborn.client.push.replicate.enabled=true | 0.6.3 |  | 
 | celeborn.client.slot.assign.maxWorkers | 10000 | false | Max workers that slots of one shuffle can be allocated on. Will choose the smaller positive one from Master side and Client side, see `celeborn.master.slot.assign.maxWorkers`. | 0.3.1 |  | 
 | celeborn.client.spark.batch.openStream.parallelClientCreation.enabled | true | false | Whether to create data clients in parallel before sending Spark batch open-stream requests. When false, data clients are created serially. | 0.6.3 |  | 
 | celeborn.client.spark.fetch.cleanFailedShuffle | false | false | whether to clean those disk space occupied by shuffles which cannot be fetched | 0.6.0 |  | 
diff --git a/docs/migration.md b/docs/migration.md
@@ -37,6 +37,8 @@ license: |
 
 - Since 0.7.0, Celeborn changed the default value of `celeborn.port.maxRetries` from `1` to `16`.
 
+- Since 0.7.0, Celeborn changed the default value of `celeborn.client.shuffleDataLostOnUnknownWorker.enabled` from `false` to `true`, which means that by default Celeborn treats shuffle data as lost when an unknown worker is detected.
+
 # Upgrading from 0.5 to 0.6
 
 - Since 0.6.0, Celeborn deprecate `celeborn.client.spark.fetch.throwsFetchFailure`. Please use `celeborn.client.spark.stageRerun.enabled` instead.

Original file line number	Diff line number	Diff line change
`@@ -372,7 +372,8 @@ class CommitManager(appUniqueId: String, val conf: CelebornConf, lifecycleManage`
`372`	`372`	`}`
`373`	`373`
`374`	`374`	`private class UnknownWorkerListener extends WorkerStatusListener {`
`375`		`- private val shuffleDataLostOnUnknownWorkerEnabled = conf.clientShuffleDataLostOnUnknownWorkerEnabled`
	`375`	`+ private val shuffleDataLostOnUnknownWorkerEnabled =`
	`376`	`+ conf.clientShuffleDataLostOnUnknownWorkerEnabled`
`376`	`377`	`private val pushReplicateEnabled = conf.clientPushReplicateEnabled`
`377`	`378`
`378`	`379`	`override def notifyChangedWorkersStatus(workersStatus: WorkersStatus): Unit = {`