Skip to content

Commit fe0d0d3

Browse files
committed
fix: adjust diskNormalized strategy to exponentially penalize cost based on disk utilization
1 parent ad106a8 commit fe0d0d3

3 files changed

Lines changed: 86 additions & 22 deletions

File tree

server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategy.java

Lines changed: 20 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -26,40 +26,40 @@
2626

2727
/**
2828
* A {@link BalancerStrategy} which normalizes the cost of placing a segment on a
29-
* server as calculated by {@link CostBalancerStrategy} by multiplying it by the
30-
* server's disk usage ratio.
29+
* server as calculated by {@link CostBalancerStrategy} by dividing by the
30+
* server's available disk headroom.
3131
* <pre>
32-
* normalizedCost = cost * usageRatio
32+
* normalizedCost = cost / max(EPSILON, 1 - usageRatio)
3333
* where usageRatio = diskUsed / totalDiskSpace
3434
* </pre>
35-
* This penalizes servers that are more full, driving disk utilization to equalize
36-
* across the tier. When all servers have equal disk usage, the behavior is identical
37-
* to {@link CostBalancerStrategy}. When historicals have different disk capacities,
38-
* this naturally accounts for both fill level and total capacity.
35+
* The denominator diverges as a server approaches full, so disk fullness has
36+
* more weight over the placement decision when servers are nearly full,
37+
* regardless of asymmetries in the locality cost. {@link #EPSILON} is a small
38+
* numerical floor on the divisor to guard against division by zero (or by
39+
* negative values during in-flight loads).
3940
* <p>
40-
* To prevent oscillation when servers have similar utilization, any server that
41+
* To prevent oscillation when servers have similar headroom, any server that
4142
* is already projected to hold the segment (the source on a move, or a currently
4243
* serving node on a drop) receives a cost discount equal to
43-
* {@link #DEFAULT_MOVE_COST_SAVINGS_THRESHOLD}. A move therefore fires only when
44+
* {@link DiskNormalizedCostBalancerStrategyConfig.DEFAULT_MOVE_COST_SAVINGS_THRESHOLD}. A move therefore fires only when
4445
* the destination saves at least this fraction of the source's cost. The default
4546
* is configurable via
4647
* {@code druid.coordinator.balancer.diskNormalized.moveCostSavingsThreshold}.
4748
*/
4849
public class DiskNormalizedCostBalancerStrategy extends CostBalancerStrategy
4950
{
5051
/**
51-
* Default minimum fractional cost reduction required before a segment will
52-
* be moved off a server that is already projected to hold it. A value of
53-
* {@code 0.05} means the destination must be at least 5% cheaper than the
54-
* source for the move to happen.
52+
* Numerical floor on the headroom divisor to prevent division by zero or by
53+
* negative values when {@code usageRatio >= 1.0} (possible for over-allocated
54+
* servers or during in-flight loads).
5555
*/
56-
static final double DEFAULT_MOVE_COST_SAVINGS_THRESHOLD = 0.05;
56+
static final double EPSILON = 1e-6;
5757

5858
private final double sourceCostMultiplier;
5959

6060
public DiskNormalizedCostBalancerStrategy(ListeningExecutorService exec)
6161
{
62-
this(exec, DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
62+
this(exec, DiskNormalizedCostBalancerStrategyConfig.DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
6363
}
6464

6565
public DiskNormalizedCostBalancerStrategy(ListeningExecutorService exec, double moveCostSavingsThreshold)
@@ -85,17 +85,16 @@ protected double computePlacementCost(
8585
return cost;
8686
}
8787

88-
// Guard against NaN propagation in the cost comparator if a server
89-
// somehow reports a non-positive maxSize. Such a server cannot hold
90-
// anything and will be rejected by canLoadSegment, so returning the
91-
// raw cost is safe.
88+
// A server with non-positive maxSize cannot hold anything and will be
89+
// rejected by canLoadSegment; return the raw cost to avoid NaN propagation.
9290
final long maxSize = server.getMaxSize();
9391
if (maxSize <= 0) {
9492
return cost;
9593
}
9694

97-
double usageRatio = (double) server.getSizeUsed() / maxSize;
98-
double normalizedCost = cost * usageRatio;
95+
final double usageRatio = (double) server.getSizeUsed() / maxSize;
96+
final double headroom = Math.max(EPSILON, 1.0 - usageRatio);
97+
double normalizedCost = cost / headroom;
9998

10099
if (server.isProjectedSegment(proposalSegment)) {
101100
normalizedCost *= sourceCostMultiplier;

server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyConfig.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@
3434
*/
3535
public class DiskNormalizedCostBalancerStrategyConfig
3636
{
37+
/**
38+
* Default minimum fractional cost reduction required before a segment will
39+
* be moved off a server that is already projected to hold it. A value of
40+
* {@code 0.05} means the destination must be at least 5% cheaper than the
41+
* source for the move to happen.
42+
*/
43+
static final double DEFAULT_MOVE_COST_SAVINGS_THRESHOLD = 0.05;
44+
3745
/**
3846
* Minimum fractional cost reduction required to move a segment off a server
3947
* that is already projected to hold it. For example, a value of {@code 0.05} means the
@@ -53,7 +61,7 @@ public DiskNormalizedCostBalancerStrategyConfig(
5361
@JsonProperty("moveCostSavingsThreshold") @Nullable Double moveCostSavingsThreshold
5462
)
5563
{
56-
this.moveCostSavingsThreshold = Configs.valueOrDefault(moveCostSavingsThreshold, DiskNormalizedCostBalancerStrategy.DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
64+
this.moveCostSavingsThreshold = Configs.valueOrDefault(moveCostSavingsThreshold, DEFAULT_MOVE_COST_SAVINGS_THRESHOLD);
5765

5866
Preconditions.checkArgument(
5967
this.moveCostSavingsThreshold >= 0.0 && this.moveCostSavingsThreshold < 1.0,

server/src/test/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyTest.java

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,63 @@ public void testThresholdBlocksMarginalMove()
304304
Assert.assertEquals("DEST", movedTo.getServer().getName());
305305
}
306306

307+
@Test
308+
public void testNearFullServerIsNotChosenForNewSegmentLoad()
309+
{
310+
final long maxSize = 10_000_000L;
311+
// A: 95% full, 5 same-DS DAY segments -> raw cost = 10 * K (low, few co-located segs)
312+
final ServerHolder nearFull = buildServer("A", maxSize, 9_500_000L, 0, 5);
313+
// B: 70% full, 20 same-DS DAY segments -> raw cost = 40 * K (higher, more co-located)
314+
final ServerHolder partial = buildServer("B", maxSize, 7_000_000L, 100, 20);
315+
316+
final DataSegment newSegment = getSegment(1000);
317+
final List<ServerHolder> servers = new ArrayList<>();
318+
servers.add(nearFull);
319+
servers.add(partial);
320+
321+
// CostBalancerStrategy picks A because raw cost 10K < 40K.
322+
Assert.assertEquals(
323+
"Pure CostBalancerStrategy must pick the near-full server (lower raw cost)",
324+
"A",
325+
newCostStrategy().findServersToLoadSegment(newSegment, servers).next().getServer().getName()
326+
);
327+
328+
// DiskNormalized: A_norm = 10K / 0.05 = 200K, B_norm = 40K / 0.30 = 133K -> B wins.
329+
Assert.assertEquals(
330+
"DiskNormalized must prefer the emptier server despite its higher raw cost",
331+
"B",
332+
newDiskNormalizedStrategy().findServersToLoadSegment(newSegment, servers).next().getServer().getName()
333+
);
334+
}
335+
336+
@Test
337+
public void testNearFullServerIsNotChosenAsMoveDestination()
338+
{
339+
final long maxSize = 10_000_000L;
340+
// SOURCE: 70% full, 20 same-DS DAY segments; segmentToMove is one of them.
341+
final ServerHolder source = buildServer("SOURCE", maxSize, 7_000_000L, 0, 20);
342+
// DEST: 95% full, 5 same-DS DAY segments -> raw cost 10K < SOURCE's 38K.
343+
final ServerHolder nearFullDest = buildServer("DEST", maxSize, 9_500_000L, 100, 5);
344+
345+
final DataSegment segmentToMove = getSegment(0);
346+
final List<ServerHolder> servers = new ArrayList<>();
347+
servers.add(source);
348+
servers.add(nearFullDest);
349+
350+
// CostBalancerStrategy: DEST raw cost (10K) < SOURCE raw cost (38K) -> recommends the move.
351+
final ServerHolder costResult =
352+
newCostStrategy().findDestinationServerToMoveSegment(segmentToMove, source, servers);
353+
Assert.assertNotNull("CostBalancerStrategy must recommend moving to the near-full DEST", costResult);
354+
Assert.assertEquals("DEST", costResult.getServer().getName());
355+
356+
// DiskNormalized: DEST_norm = 10K / 0.05 = 200K > SOURCE_norm = 38K / 0.30 * 0.95 ≈ 120K.
357+
// Near-full DEST is too expensive after normalization -> no move.
358+
Assert.assertNull(
359+
"DiskNormalized must block the move to the near-full server",
360+
newDiskNormalizedStrategy().findDestinationServerToMoveSegment(segmentToMove, source, servers)
361+
);
362+
}
363+
307364
@Test
308365
public void testRejectsInvalidThreshold()
309366
{

0 commit comments

Comments
 (0)