Skip to content

Commit c083790

Browse files
authored
Change memory bars color on spilling/paused status (#6959)
1 parent 4cf9baf commit c083790

3 files changed

Lines changed: 131 additions & 11 deletions

File tree

distributed/dashboard/components/scheduler.py

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
parse_timedelta,
6565
)
6666

67+
from distributed.core import Status
6768
from distributed.dashboard.components import add_periodic_callback
6869
from distributed.dashboard.components.shared import (
6970
DashboardComponent,
@@ -239,20 +240,63 @@ def update(self):
239240
self.source.data.update({"left": x[:-1], "right": x[1:], "top": counts})
240241

241242

242-
def _memory_color(current: int, limit: int) -> str:
    """Dynamic color used by WorkersMemory and ClusterMemory

    Parameters
    ----------
    current:
        Memory currently in use
    limit:
        Memory limit; a falsy value (e.g. 0) means there is no limit

    Returns
    -------
    ``"red"`` when ``current`` exceeds ``limit``, ``"orange"`` when it exceeds half of
    ``limit``, ``"blue"`` otherwise or when there is no limit.
    """
    # Without a limit there is nothing to compare against
    if not limit:
        return "blue"
    if current > limit:
        return "red"
    return "orange" if current > limit / 2 else "blue"
243+
class MemoryColor:
    """Change the color of the memory bars from blue to orange when process memory goes
    above the ``target`` threshold and to red when the worker pauses.
    Workers in ``closing_gracefully`` state, or in any other status than ``running``,
    will also be red.

    If ``target`` is disabled, change to orange on ``spill`` instead.
    If spilling is completely disabled, never turn orange.

    Regardless of the worker status, also change to red when process memory reaches
    the ``terminate`` threshold. If terminate is disabled, or set above 100%, turn red
    when reaching ``memory_limit`` instead.

    If a worker has no ``memory_limit``, the thresholds are ignored and only the worker
    status can change the color.

    Note
    ----
    A worker will start spilling when managed memory alone passes the target threshold.
    However, here we're switching to orange when the process memory goes beyond target,
    which is usually earlier.
    This is deliberate for the sake of simplicity and also because, when the process
    memory passes the spill threshold, it will keep spilling until it falls below the
    target threshold - so it's not completely wrong. Again, we don't want to track
    the hysteresis cycle of the spill system here for the sake of simplicity.

    In short, orange should be treated as "the worker *may* be spilling".
    """

    # Fraction of memory_limit from which a running worker is shown in orange;
    # math.inf when both target and spill are disabled
    orange: float
    # Fraction of memory_limit from which a running worker is shown in red;
    # never more than 1.0
    red: float

    def __init__(self):
        target = dask.config.get("distributed.worker.memory.target")
        spill = dask.config.get("distributed.worker.memory.spill")
        terminate = dask.config.get("distributed.worker.memory.terminate")
        # These values can be False. It's also common to configure them to impossibly
        # high values to achieve the same effect.
        self.orange = min(target or math.inf, spill or math.inf)
        self.red = min(terminate or math.inf, 1.0)

    def _memory_color(self, current: int, limit: int, status: Status) -> str:
        """Return the color of the memory bar of a single worker.

        Parameters
        ----------
        current:
            Process memory of the worker
        limit:
            Memory limit of the worker; a falsy value (e.g. 0) disables all
            threshold-based color changes
        status:
            Status of the worker

        Returns
        -------
        ``"red"``, ``"orange"``, or ``"blue"``
        """
        # The status takes precedence over everything else: a worker that is not
        # running is red, no matter how little memory it uses
        if status != Status.running:
            return "red"
        if not limit:
            return "blue"
        # Test the red threshold first, as it may be lower than the orange one
        # depending on the configuration
        if current >= limit * self.red:
            return "red"
        if current >= limit * self.orange:
            return "orange"
        return "blue"
290+
291+
292+
class ClusterMemory(DashboardComponent, MemoryColor):
252293
"""Total memory usage on the cluster"""
253294

254295
@log_errors
255296
def __init__(self, scheduler, width=600, **kwargs):
297+
DashboardComponent.__init__(self)
298+
MemoryColor.__init__(self)
299+
256300
self.scheduler = scheduler
257301
self.source = ColumnDataSource(
258302
{
@@ -327,12 +371,30 @@ def __init__(self, scheduler, width=600, **kwargs):
327371
)
328372
self.root.add_tools(hover)
329373

374+
def _cluster_memory_color(self) -> str:
    """Return the most severe memory bar color found among all workers.

    Every worker known to the scheduler is passed through ``self._memory_color``;
    the result is ``"red"`` if any worker is red, else ``"orange"`` if any worker is
    orange, else ``"blue"`` (also when there are no workers).
    """
    seen = set()
    for ws in self.scheduler.workers.values():
        seen.add(
            self._memory_color(
                current=ws.memory.process,
                # Fall back to 0 (no limit) when the attribute is missing
                limit=getattr(ws, "memory_limit", 0),
                status=ws.status,
            )
        )

    assert seen <= {"red", "orange", "blue"}
    # Scan from the most to the least severe color
    for color in ("red", "orange"):
        if color in seen:
            return color
    return "blue"
391+
330392
@without_property_validation
331393
@log_errors
332394
def update(self):
333395
limit = sum(ws.memory_limit for ws in self.scheduler.workers.values())
334396
meminfo = self.scheduler.memory
335-
color = _memory_color(meminfo.process, limit)
397+
color = self._cluster_memory_color()
336398

337399
width = [
338400
meminfo.managed_in_memory,
@@ -363,11 +425,14 @@ def update(self):
363425
update(self.source, result)
364426

365427

366-
class WorkersMemory(DashboardComponent):
428+
class WorkersMemory(DashboardComponent, MemoryColor):
367429
"""Memory usage for single workers"""
368430

369431
@log_errors
370432
def __init__(self, scheduler, width=600, **kwargs):
433+
DashboardComponent.__init__(self)
434+
MemoryColor.__init__(self)
435+
371436
self.scheduler = scheduler
372437
self.source = ColumnDataSource(
373438
{
@@ -477,7 +542,7 @@ def quadlist(i: Iterable[T]) -> list[T]:
477542
meminfo = ws.memory
478543
limit = getattr(ws, "memory_limit", 0)
479544
max_limit = max(max_limit, limit, meminfo.process + meminfo.managed_spilled)
480-
color_i = _memory_color(meminfo.process, limit)
545+
color_i = self._memory_color(meminfo.process, limit, ws.status)
481546

482547
width += [
483548
meminfo.managed_in_memory,

distributed/dashboard/tests/test_scheduler_bokeh.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from dask.utils import stringify
2020

2121
from distributed.client import wait
22+
from distributed.core import Status
2223
from distributed.dashboard import scheduler
2324
from distributed.dashboard.components.scheduler import (
2425
AggregateAction,
@@ -29,6 +30,7 @@
2930
Events,
3031
Hardware,
3132
MemoryByKey,
33+
MemoryColor,
3234
Occupancy,
3335
ProcessingHistogram,
3436
ProfileServer,
@@ -323,6 +325,48 @@ async def test_ClusterMemory(c, s, a, b):
323325
assert not all(d["width"])
324326

325327

328+
def test_memory_color():
    """Exercise ``MemoryColor._memory_color`` under several memory configurations."""

    def config(**kwargs):
        # Temporarily override the given distributed.worker.memory.* settings
        overrides = {}
        for k, v in kwargs.items():
            overrides[f"distributed.worker.memory.{k}"] = v
        return dask.config.set(overrides)

    def check(cases):
        # MemoryColor reads the thresholds from the config when it is instantiated,
        # so this must be called while the config override is active
        c = MemoryColor()
        for current, limit, status, expected in cases:
            assert c._memory_color(current, limit, status) == expected

    with config(target=0.6, spill=0.7, pause=0.8, terminate=0.95):
        check(
            [
                (50, 100, Status.running, "blue"),
                (60, 100, Status.running, "orange"),
                (75, 100, Status.running, "orange"),
                # Pause is not impacted by the paused threshold, but by the worker
                # status
                (85, 100, Status.running, "orange"),
                (0, 100, Status.paused, "red"),
                (0, 100, Status.closing_gracefully, "red"),
                # Passing the terminate threshold will turn the bar red, regardless
                # of pause
                (95, 100, Status.running, "red"),
                # Disabling memory limit disables all threshold-related color changes
                (100, 0, Status.running, "blue"),
                (100, 0, Status.paused, "red"),
            ]
        )

    # target disabled
    with config(target=False, spill=0.7):
        check(
            [
                (60, 100, Status.running, "blue"),
                (75, 100, Status.running, "orange"),
            ]
        )

    # spilling disabled
    with config(target=False, spill=False, pause=0.8, terminate=0.95):
        check(
            [
                (94, 100, Status.running, "blue"),
                (0, 100, Status.closing_gracefully, "red"),
                (95, 100, Status.running, "red"),
            ]
        )

    # terminate disabled; fall back to 100%
    with config(target=False, spill=False, terminate=False):
        check(
            [
                (99, 100, Status.running, "blue"),
                (100, 100, Status.running, "red"),
                (110, 100, Status.running, "red"),
            ]
        )
368+
369+
326370
@gen_cluster(client=True)
327371
async def test_WorkersMemoryHistogram(c, s, a, b):
328372
nh = WorkersMemoryHistogram(s)

docs/source/worker-memory.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,17 @@ spilled
134134
The sum of managed + unmanaged + unmanaged recent is equal by definition to the process
135135
memory.
136136

137+
The color of the bars will change as a function of memory usage too:
138+
139+
blue
140+
The worker is operating as normal
141+
orange
142+
The worker may be spilling data to disk
143+
red
144+
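The worker is paused or retiring, or its process memory has reached the ``terminate`` threshold or the memory limit, whichever is lower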
145+
grey
146+
Data that has already been spilled to disk; this is in addition to process memory
147+
137148

138149
.. _memtrim:
139150

0 commit comments

Comments
 (0)