akka
diff --git a/‎akka-cluster-sharding/src/main/mima-filters/2.10.16.backwards.excludes/self-healing.excludes‎
Lines changed: 4 additions & 0 deletions b/‎akka-cluster-sharding/src/main/mima-filters/2.10.16.backwards.excludes/self-healing.excludes‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎akka-cluster-sharding/src/main/resources/reference.conf‎
Lines changed: 33 additions & 0 deletions b/‎akka-cluster-sharding/src/main/resources/reference.conf‎
Lines changed: 33 additions & 0 deletions
diff --git a/‎akka-cluster-sharding/src/main/scala/akka/cluster/sharding/ClusterShardingSettings.scala‎
Lines changed: 141 additions & 5 deletions b/‎akka-cluster-sharding/src/main/scala/akka/cluster/sharding/ClusterShardingSettings.scala‎
Lines changed: 141 additions & 5 deletions
@@ -0,0 +1,4 @@
+# Self-healing settings added to ClusterShardingSettings constructor
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ClusterShardingSettings.this")
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ClusterShardingSettings.copy")
+ProblemFilters.exclude[DirectMissingMethodProblem]("akka.cluster.sharding.ClusterShardingSettings.copy$default$*")
@@ -466,6 +466,39 @@ akka.cluster.sharding {
     # coordinator for that entity type.
     disabled-after = 10s
   }
+
+  # Self-healing configuration for automatic shard state cleanup when cluster coordination is impaired.
+  # When enabled, shards allocated to regions that have been unreachable beyond a configurable threshold
+  # will be automatically deallocated, enabling graceful degradation and recovery.
+  self-healing {
+    # Enable/disable the self-healing mechanism.
+    # When enabled, shards from regions that have been unreachable beyond the
+    # stale-region-timeout will be automatically deallocated and can be reallocated
+    # to healthy regions.
+    enabled = off
+
+    # Time threshold after which an unreachable region's shards are considered stale
+    # and will be deallocated. This should be longer than expected network hiccups
+    # but shorter than manual intervention time.
+    # Recommended: at least 2x akka.cluster.failure-detector.acceptable-heartbeat-pause (default 3s).
+    stale-region-timeout = 30s
+
+    # How often to check for stale regions.
+    # This interval controls the resolution of the timeout detection.
+    # Smaller values detect staleness faster but add more CPU overhead.
+    check-interval = 5s
+
+    # Grace period after coordinator startup before self-healing activates.
+    # This prevents premature cleanup during cluster formation when nodes
+    # may be temporarily unreachable as they join.
+    startup-grace-period = 60s
+
+    # Whether to only log warnings instead of actually deallocating shards.
+    # Useful for testing/monitoring the self-healing behavior before enabling
+    # full automatic cleanup. When dry-run is on, the shards that would be
+    # deallocated are logged but no deallocation is performed.
+    dry-run = off
+  }
 }
 # //#sharding-ext-config
 
 
@@ -105,6 +105,8 @@ object ClusterShardingSettings {
           ))
     }
 
+    val selfHealingSettings = SelfHealingSettings(config.getConfig("self-healing"))
+
     new ClusterShardingSettings(
       role = roleOption(config.getString("role")),
       rememberEntities = config.getBoolean("remember-entities"),
@@ -117,7 +119,8 @@ object ClusterShardingSettings {
       tuningParameters,
       config.getBoolean("coordinator-singleton-role-override"),
       coordinatorSingletonSettings,
-      lease)
+      lease,
+      selfHealingSettings)
   }
 
   /**
@@ -909,6 +912,95 @@ object ClusterShardingSettings {
     }
   }
 
+  /**
+   * Factory for [[SelfHealingSettings]], the settings of the self-healing mechanism that
+   * automatically removes stale shard state when cluster coordination is impaired.
+   */
+  object SelfHealingSettings {
+
+    /**
+     * Create settings from a configuration with the same layout as
+     * the default configuration `akka.cluster.sharding.self-healing`.
+     *
+     * Reads the keys `enabled`, `stale-region-timeout`, `check-interval`,
+     * `startup-grace-period` and `dry-run`. The three durations are read with
+     * millisecond precision.
+     *
+     * @param config the `self-healing` config section
+     * @return the parsed settings
+     * @throws java.lang.IllegalArgumentException if a duration violates the `require`
+     *   checks in the [[SelfHealingSettings]] constructor
+     */
+    def apply(config: Config): SelfHealingSettings = {
+      new SelfHealingSettings(
+        enabled = config.getBoolean("enabled"),
+        staleRegionTimeout = config.getDuration("stale-region-timeout", MILLISECONDS).millis,
+        checkInterval = config.getDuration("check-interval", MILLISECONDS).millis,
+        startupGracePeriod = config.getDuration("startup-grace-period", MILLISECONDS).millis,
+        dryRun = config.getBoolean("dry-run"))
+    }
+
+    /**
+     * Default settings with self-healing disabled.
+     *
+     * The durations (30s / 5s / 60s) and `dryRun = false` are hard-coded here rather than
+     * read from configuration; they match the values in the `self-healing` section of
+     * `reference.conf` and should be kept in sync with it.
+     */
+    val disabled: SelfHealingSettings = new SelfHealingSettings(
+      enabled = false,
+      staleRegionTimeout = 30.seconds,
+      checkInterval = 5.seconds,
+      startupGracePeriod = 60.seconds,
+      dryRun = false)
+  }
+
+  /**
+   * Settings for the self-healing mechanism that automatically removes stale shard state
+   * when cluster coordination is impaired.
+   *
+   * Instances are immutable: every `withXyz` method returns a new instance with one field
+   * replaced and all other fields carried over. The `java.time.Duration` overloads convert
+   * the argument with `toScala` and otherwise behave like their `FiniteDuration` counterparts.
+   *
+   * Construction fails (via `require`) unless `staleRegionTimeout > 0`, `checkInterval > 0`
+   * and `startupGracePeriod >= 0`. The `withXyz` methods create the new instance through the
+   * same constructor, so their arguments are validated the same way.
+   *
+   * `toString` lists all five fields; `equals`/`hashCode` are not overridden.
+   *
+   * @param enabled Whether self-healing is enabled
+   * @param staleRegionTimeout Time threshold after which an unreachable region's shards are considered stale
+   * @param checkInterval How often to check for stale regions
+   * @param startupGracePeriod Grace period after coordinator startup before self-healing activates
+   * @param dryRun If true, only log warnings without actually deallocating shards
+   */
+  final class SelfHealingSettings(
+      val enabled: Boolean,
+      val staleRegionTimeout: FiniteDuration,
+      val checkInterval: FiniteDuration,
+      val startupGracePeriod: FiniteDuration,
+      val dryRun: Boolean) {
+
+    require(staleRegionTimeout > Duration.Zero, "stale-region-timeout must be > 0")
+    require(checkInterval > Duration.Zero, "check-interval must be > 0")
+    require(startupGracePeriod >= Duration.Zero, "startup-grace-period must be >= 0")
+
+    def withEnabled(enabled: Boolean): SelfHealingSettings =
+      copy(enabled = enabled)
+
+    def withStaleRegionTimeout(timeout: FiniteDuration): SelfHealingSettings =
+      copy(staleRegionTimeout = timeout)
+
+    def withStaleRegionTimeout(timeout: java.time.Duration): SelfHealingSettings =
+      copy(staleRegionTimeout = timeout.toScala)
+
+    def withCheckInterval(interval: FiniteDuration): SelfHealingSettings =
+      copy(checkInterval = interval)
+
+    def withCheckInterval(interval: java.time.Duration): SelfHealingSettings =
+      copy(checkInterval = interval.toScala)
+
+    def withStartupGracePeriod(period: FiniteDuration): SelfHealingSettings =
+      copy(startupGracePeriod = period)
+
+    def withStartupGracePeriod(period: java.time.Duration): SelfHealingSettings =
+      copy(startupGracePeriod = period.toScala)
+
+    def withDryRun(dryRun: Boolean): SelfHealingSettings =
+      copy(dryRun = dryRun)
+
+    private def copy(
+        enabled: Boolean = enabled,
+        staleRegionTimeout: FiniteDuration = staleRegionTimeout,
+        checkInterval: FiniteDuration = checkInterval,
+        startupGracePeriod: FiniteDuration = startupGracePeriod,
+        dryRun: Boolean = dryRun): SelfHealingSettings =
+      new SelfHealingSettings(enabled, staleRegionTimeout, checkInterval, startupGracePeriod, dryRun)
+
+    override def toString: String =
+      s"SelfHealingSettings(enabled=$enabled, staleRegionTimeout=$staleRegionTimeout, " +
+      s"checkInterval=$checkInterval, startupGracePeriod=$startupGracePeriod, " +
+      s"dryRun=$dryRun)"
+  }
+
   class TuningParameters(
       val coordinatorFailureBackoff: FiniteDuration,
       val retryInterval: FiniteDuration,
@@ -1176,6 +1268,8 @@ object ClusterShardingSettings {
  *   Note that if you define a custom lease name and have several sharding entity types each one must have a unique
  *   lease name. If the lease name is undefined it will be derived from ActorSystem name and shard name,
  *   but that may result in too long lease names.
+ * @param selfHealingSettings Settings for the self-healing mechanism that automatically removes stale shard state
+ *   when cluster coordination is impaired.
  */
 final class ClusterShardingSettings(
     val role: Option[String],
@@ -1189,8 +1283,40 @@ final class ClusterShardingSettings(
     val tuningParameters: ClusterShardingSettings.TuningParameters,
     val coordinatorSingletonOverrideRole: Boolean,
     val coordinatorSingletonSettings: ClusterSingletonManagerSettings,
-    val leaseSettings: Option[LeaseUsageSettings])
+    val leaseSettings: Option[LeaseUsageSettings],
+    val selfHealingSettings: ClusterShardingSettings.SelfHealingSettings)
     extends NoSerializationVerificationNeeded {
+  @deprecated(
+    "Use the ClusterShardingSettings factory methods or the constructor including selfHealingSettings instead",
+    "2.10.0") // NOTE(review): the MiMa excludes for this change sit under 2.10.16.backwards.excludes, so this since-version looks too early; confirm the release
+  def this(
+      role: Option[String],
+      rememberEntities: Boolean,
+      journalPluginId: String,
+      snapshotPluginId: String,
+      stateStoreMode: String,
+      rememberEntitiesStore: String,
+      passivationStrategySettings: ClusterShardingSettings.PassivationStrategySettings,
+      shardRegionQueryTimeout: FiniteDuration,
+      tuningParameters: ClusterShardingSettings.TuningParameters,
+      coordinatorSingletonOverrideRole: Boolean,
+      coordinatorSingletonSettings: ClusterSingletonManagerSettings,
+      leaseSettings: Option[LeaseUsageSettings]) =
+    this( // previous 12-argument signature kept for compatibility; forwards every argument unchanged
+      role,
+      rememberEntities,
+      journalPluginId,
+      snapshotPluginId,
+      stateStoreMode,
+      rememberEntitiesStore,
+      passivationStrategySettings,
+      shardRegionQueryTimeout,
+      tuningParameters,
+      coordinatorSingletonOverrideRole,
+      coordinatorSingletonSettings,
+      leaseSettings,
+      ClusterShardingSettings.SelfHealingSettings.disabled) // callers of the old signature get self-healing switched off
+
+
   @deprecated(
     "Use the ClusterShardingSettings factory methods or the constructor including coordinatorSingletonOverrideRole instead",
     "2.6.20")
@@ -1218,7 +1344,8 @@ final class ClusterShardingSettings(
       tuningParameters,
       coordinatorSingletonOverrideRole = true,
       coordinatorSingletonSettings,
-      leaseSettings)
+      leaseSettings,
+      ClusterShardingSettings.SelfHealingSettings.disabled)
 
   @deprecated(
     "Use the ClusterShardingSettings factory methods or the constructor including passivationStrategySettings instead",
@@ -1390,6 +1517,13 @@ final class ClusterShardingSettings(
       coordinatorSingletonSettings: ClusterSingletonManagerSettings): ClusterShardingSettings =
     copy(coordinatorSingletonSettings = coordinatorSingletonSettings)
 
+  /**
+   * Configure the self-healing mechanism for automatic shard state cleanup.
+   *
+   * Does not modify this instance: the result is produced by `copy` with only
+   * `selfHealingSettings` replaced.
+   *
+   * @param selfHealingSettings the self-healing settings to use
+   * @return a `ClusterShardingSettings` carrying the given self-healing settings
+   */
+  def withSelfHealingSettings(
+      selfHealingSettings: ClusterShardingSettings.SelfHealingSettings): ClusterShardingSettings =
+    copy(selfHealingSettings = selfHealingSettings)
+
   private def copy(
       role: Option[String] = role,
       rememberEntities: Boolean = rememberEntities,
@@ -1401,7 +1535,8 @@ final class ClusterShardingSettings(
       tuningParameters: ClusterShardingSettings.TuningParameters = tuningParameters,
       coordinatorSingletonOverrideRole: Boolean = coordinatorSingletonOverrideRole,
       coordinatorSingletonSettings: ClusterSingletonManagerSettings = coordinatorSingletonSettings,
-      leaseSettings: Option[LeaseUsageSettings] = leaseSettings): ClusterShardingSettings =
+      leaseSettings: Option[LeaseUsageSettings] = leaseSettings,
+      selfHealingSettings: ClusterShardingSettings.SelfHealingSettings = selfHealingSettings): ClusterShardingSettings =
     new ClusterShardingSettings(
       role,
       rememberEntities,
@@ -1414,5 +1549,6 @@ final class ClusterShardingSettings(
       tuningParameters,
       coordinatorSingletonOverrideRole,
       coordinatorSingletonSettings,
-      leaseSettings)
+      leaseSettings,
+      selfHealingSettings)
 }