diff --git a/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategy.java b/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategy.java index e8b1b902dde6..815bf8799512 100644 --- a/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategy.java +++ b/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategy.java @@ -26,21 +26,22 @@ /** * A {@link BalancerStrategy} which normalizes the cost of placing a segment on a - * server as calculated by {@link CostBalancerStrategy} by multiplying it by the - * server's disk usage ratio. + * server as calculated by {@link CostBalancerStrategy} by dividing by the + * server's available disk headroom. *
- * normalizedCost = cost * usageRatio + * normalizedCost = cost / max(EPSILON, 1 - usageRatio) * where usageRatio = diskUsed / totalDiskSpace *- * This penalizes servers that are more full, driving disk utilization to equalize - * across the tier. When all servers have equal disk usage, the behavior is identical - * to {@link CostBalancerStrategy}. When historicals have different disk capacities, - * this naturally accounts for both fill level and total capacity. + * The denominator diverges as a server approaches full, so disk fullness has + * more weight over the placement decision when servers are nearly full, + * regardless of asymmetries in the locality cost. {@link #EPSILON} is a small + * numerical floor on the divisor to guard against division by zero (or by + * negative values during in-flight loads). *
- * To prevent oscillation when servers have similar utilization, any server that
+ * To prevent oscillation when servers have similar headroom, any server that
* is already projected to hold the segment (the source on a move, or a currently
* serving node on a drop) receives a cost discount equal to
- * {@link #DEFAULT_MOVE_COST_SAVINGS_THRESHOLD}. A move therefore fires only when
+ * {@link DiskNormalizedCostBalancerStrategyConfig.DEFAULT_MOVE_COST_SAVINGS_THRESHOLD}. A move therefore fires only when
* the destination saves at least this fraction of the source's cost. The default
* is configurable via
* {@code druid.coordinator.balancer.diskNormalized.moveCostSavingsThreshold}.
@@ -48,18 +49,17 @@
public class DiskNormalizedCostBalancerStrategy extends CostBalancerStrategy
{
/**
- * Default minimum fractional cost reduction required before a segment will
- * be moved off a server that is already projected to hold it. A value of
- * {@code 0.05} means the destination must be at least 5% cheaper than the
- * source for the move to happen.
+ * Numerical floor on the headroom divisor to prevent division by zero or by
+ * negative values when {@code usageRatio >= 1.0} (possible for over-allocated
+ * servers or during in-flight loads).
*/
- static final double DEFAULT_MOVE_COST_SAVINGS_THRESHOLD = 0.05;
+ static final double EPSILON = 1e-6;
private final double sourceCostMultiplier;
public DiskNormalizedCostBalancerStrategy(ListeningExecutorService exec)
{
- this(exec, DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
+ this(exec, DiskNormalizedCostBalancerStrategyConfig.DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
}
public DiskNormalizedCostBalancerStrategy(ListeningExecutorService exec, double moveCostSavingsThreshold)
@@ -85,17 +85,16 @@ protected double computePlacementCost(
return cost;
}
- // Guard against NaN propagation in the cost comparator if a server
- // somehow reports a non-positive maxSize. Such a server cannot hold
- // anything and will be rejected by canLoadSegment, so returning the
- // raw cost is safe.
+ // A server with non-positive maxSize cannot hold anything and will be
+ // rejected by canLoadSegment; return the raw cost to avoid NaN propagation.
final long maxSize = server.getMaxSize();
if (maxSize <= 0) {
return cost;
}
- double usageRatio = (double) server.getSizeUsed() / maxSize;
- double normalizedCost = cost * usageRatio;
+ final double usageRatio = (double) server.getSizeUsed() / maxSize;
+ final double headroom = Math.max(EPSILON, 1.0 - usageRatio);
+ double normalizedCost = cost / headroom;
if (server.isProjectedSegment(proposalSegment)) {
normalizedCost *= sourceCostMultiplier;
diff --git a/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyConfig.java b/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyConfig.java
index 95680e7e7dd6..1219eddc8ab8 100644
--- a/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyConfig.java
+++ b/server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyConfig.java
@@ -34,6 +34,14 @@
*/
public class DiskNormalizedCostBalancerStrategyConfig
{
+ /**
+ * Default minimum fractional cost reduction required before a segment will
+ * be moved off a server that is already projected to hold it. A value of
+ * {@code 0.05} means the destination must be at least 5% cheaper than the
+ * source for the move to happen.
+ */
+ static final double DEFAULT_MOVE_COST_SAVINGS_THRESHOLD = 0.05;
+
/**
* Minimum fractional cost reduction required to move a segment off a server
* that is already projected to hold it. For example, a value of {@code 0.05} means the
@@ -53,7 +61,7 @@ public DiskNormalizedCostBalancerStrategyConfig(
@JsonProperty("moveCostSavingsThreshold") @Nullable Double moveCostSavingsThreshold
)
{
- this.moveCostSavingsThreshold = Configs.valueOrDefault(moveCostSavingsThreshold, DiskNormalizedCostBalancerStrategy.DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
+ this.moveCostSavingsThreshold = Configs.valueOrDefault(moveCostSavingsThreshold, DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
Preconditions.checkArgument(
this.moveCostSavingsThreshold >= 0.0 && this.moveCostSavingsThreshold < 1.0,
diff --git a/server/src/test/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyTest.java b/server/src/test/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyTest.java
index f57199c1f48c..9df33be4d6cf 100644
--- a/server/src/test/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyTest.java
+++ b/server/src/test/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyTest.java
@@ -304,6 +304,63 @@ public void testThresholdBlocksMarginalMove()
Assert.assertEquals("DEST", movedTo.getServer().getName());
}
+ @Test
+ public void testNearFullServerIsNotChosenForNewSegmentLoad()
+ {
+ final long maxSize = 10_000_000L;
+ // A: 95% full, 5 same-DS DAY segments -> raw cost = 10 * K (low, few co-located segs)
+ final ServerHolder nearFull = buildServer("A", maxSize, 9_500_000L, 0, 5);
+ // B: 70% full, 20 same-DS DAY segments -> raw cost = 40 * K (higher, more co-located)
+ final ServerHolder partial = buildServer("B", maxSize, 7_000_000L, 100, 20);
+
+ final DataSegment newSegment = getSegment(1000);
+ final List