Skip to content

Commit b3521f9

Browse files
committed
fix: adjust diskNormalized strategy to exponentially penalize cost based on disk utilization
1 parent ad106a8 commit b3521f9

2 files changed

Lines changed: 95 additions & 14 deletions

File tree

server/src/main/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategy.java

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,19 @@
2626

2727
/**
2828
* A {@link BalancerStrategy} which normalizes the cost of placing a segment on a
29-
* server as calculated by {@link CostBalancerStrategy} by multiplying it by the
30-
* server's disk usage ratio.
29+
* server as calculated by {@link CostBalancerStrategy} by dividing by the
30+
* server's available disk headroom.
3131
* <pre>
32-
* normalizedCost = cost * usageRatio
32+
* normalizedCost = cost / max(EPSILON, 1 - usageRatio)
3333
* where usageRatio = diskUsed / totalDiskSpace
3434
* </pre>
35-
* This penalizes servers that are more full, driving disk utilization to equalize
36-
* across the tier. When all servers have equal disk usage, the behavior is identical
37-
* to {@link CostBalancerStrategy}. When historicals have different disk capacities,
38-
* this naturally accounts for both fill level and total capacity.
35+
* The denominator diverges as a server approaches full, so disk fullness has
36+
* more weight over the placement decision when servers are nearly full,
37+
* regardless of asymmetries in the locality cost. {@link #EPSILON} is a small
38+
* numerical floor on the divisor to guard against division by zero (or by
39+
* negative values during in-flight loads).
3940
* <p>
40-
* To prevent oscillation when servers have similar utilization, any server that
41+
* To prevent oscillation when servers have similar headroom, any server that
4142
* is already projected to hold the segment (the source on a move, or a currently
4243
* serving node on a drop) receives a cost discount equal to
4344
* {@link #DEFAULT_MOVE_COST_SAVINGS_THRESHOLD}. A move therefore fires only when
@@ -55,6 +56,13 @@ public class DiskNormalizedCostBalancerStrategy extends CostBalancerStrategy
5556
*/
5657
static final double DEFAULT_MOVE_COST_SAVINGS_THRESHOLD = 0.05;
5758

59+
/**
60+
* Numerical floor on the headroom divisor to prevent division by zero or by
61+
* negative values when {@code usageRatio >= 1.0} (possible for over-allocated
62+
* servers or during in-flight loads).
63+
*/
64+
static final double EPSILON = 1e-6;
65+
5866
private final double sourceCostMultiplier;
5967

6068
public DiskNormalizedCostBalancerStrategy(ListeningExecutorService exec)
@@ -85,17 +93,16 @@ protected double computePlacementCost(
8593
return cost;
8694
}
8795

88-
// Guard against NaN propagation in the cost comparator if a server
89-
// somehow reports a non-positive maxSize. Such a server cannot hold
90-
// anything and will be rejected by canLoadSegment, so returning the
91-
// raw cost is safe.
96+
// A server with non-positive maxSize cannot hold anything and will be
97+
// rejected by canLoadSegment; return the raw cost to avoid NaN propagation.
9298
final long maxSize = server.getMaxSize();
9399
if (maxSize <= 0) {
94100
return cost;
95101
}
96102

97-
double usageRatio = (double) server.getSizeUsed() / maxSize;
98-
double normalizedCost = cost * usageRatio;
103+
final double usageRatio = (double) server.getSizeUsed() / maxSize;
104+
final double headroom = Math.max(EPSILON, 1.0 - usageRatio);
105+
double normalizedCost = cost / headroom;
99106

100107
if (server.isProjectedSegment(proposalSegment)) {
101108
normalizedCost *= sourceCostMultiplier;

server/src/test/java/org/apache/druid/server/coordinator/balancer/DiskNormalizedCostBalancerStrategyTest.java

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,80 @@ public void testThresholdBlocksMarginalMove()
304304
Assert.assertEquals("DEST", movedTo.getServer().getName());
305305
}
306306

307+
/**
308+
* Regression: when one server is near-full (~95%) and another is moderately
309+
* used (~70%), the near-full server can still have a lower <em>raw</em>
310+
* locality cost if it holds fewer co-located segments. The old
311+
* {@link CostBalancerStrategy} would happily send new segments there.
312+
* {@link DiskNormalizedCostBalancerStrategy} must prefer the emptier server
313+
* because dividing by the smaller headroom (0.05 vs 0.30) makes the
314+
* near-full server's normalized cost far higher.
315+
*/
316+
@Test
317+
public void testNearFullServerIsNotChosenForNewSegmentLoad()
318+
{
319+
final long maxSize = 10_000_000L;
320+
// A: 95% full, 5 same-DS DAY segments -> raw cost = 10 * K (low, few co-located segs)
321+
final ServerHolder nearFull = buildServer("A", maxSize, 9_500_000L, 0, 5);
322+
// B: 70% full, 20 same-DS DAY segments -> raw cost = 40 * K (higher, more co-located)
323+
final ServerHolder partial = buildServer("B", maxSize, 7_000_000L, 100, 20);
324+
325+
final DataSegment newSegment = getSegment(1000);
326+
final List<ServerHolder> servers = new ArrayList<>();
327+
servers.add(nearFull);
328+
servers.add(partial);
329+
330+
// CostBalancerStrategy picks A because raw cost 10K < 40K.
331+
Assert.assertEquals(
332+
"Pure CostBalancerStrategy must pick the near-full server (lower raw cost)",
333+
"A",
334+
newCostStrategy().findServersToLoadSegment(newSegment, servers).next().getServer().getName()
335+
);
336+
337+
// DiskNormalized: A_norm = 10K / 0.05 = 200K, B_norm = 40K / 0.30 = 133K -> B wins.
338+
Assert.assertEquals(
339+
"DiskNormalized must prefer the emptier server despite its higher raw cost",
340+
"B",
341+
newDiskNormalizedStrategy().findServersToLoadSegment(newSegment, servers).next().getServer().getName()
342+
);
343+
}
344+
345+
/**
346+
* Regression: the source server is at 70% and the only available destination
347+
* is at 95%. The destination holds fewer co-located segments so its raw
348+
* locality cost is lower — the old balancer would initiate the move.
349+
* {@link DiskNormalizedCostBalancerStrategy} must block it because the
350+
* near-full destination's normalized cost (10K / 0.05 = 200K) exceeds the
351+
* source's discounted cost (38K / 0.30 * 0.95 ≈ 120K).
352+
*/
353+
@Test
354+
public void testNearFullServerIsNotChosenAsMoveDestination()
355+
{
356+
final long maxSize = 10_000_000L;
357+
// SOURCE: 70% full, 20 same-DS DAY segments; segmentToMove is one of them.
358+
final ServerHolder source = buildServer("SOURCE", maxSize, 7_000_000L, 0, 20);
359+
// DEST: 95% full, 5 same-DS DAY segments -> raw cost 10K < SOURCE's 38K.
360+
final ServerHolder nearFullDest = buildServer("DEST", maxSize, 9_500_000L, 100, 5);
361+
362+
final DataSegment segmentToMove = getSegment(0);
363+
final List<ServerHolder> servers = new ArrayList<>();
364+
servers.add(source);
365+
servers.add(nearFullDest);
366+
367+
// CostBalancerStrategy: DEST raw cost (10K) < SOURCE raw cost (38K) -> recommends the move.
368+
final ServerHolder costResult =
369+
newCostStrategy().findDestinationServerToMoveSegment(segmentToMove, source, servers);
370+
Assert.assertNotNull("CostBalancerStrategy must recommend moving to the near-full DEST", costResult);
371+
Assert.assertEquals("DEST", costResult.getServer().getName());
372+
373+
// DiskNormalized: DEST_norm = 10K / 0.05 = 200K > SOURCE_norm = 38K / 0.30 * 0.95 ≈ 120K.
374+
// Near-full DEST is too expensive after normalization -> no move.
375+
Assert.assertNull(
376+
"DiskNormalized must block the move to the near-full server",
377+
newDiskNormalizedStrategy().findDestinationServerToMoveSegment(segmentToMove, source, servers)
378+
);
379+
}
380+
307381
@Test
308382
public void testRejectsInvalidThreshold()
309383
{

0 commit comments

Comments
 (0)