Skip to content

Commit 425e8ab

Browse files
committed
[Iris] Add sections to the UI for task stats
The new UI sections show up for the task that is the Zephyr controller. Not sure if we want to bubble them up so the job view for the job that runs the single Zephyr controller task.
1 parent de2c3d6 commit 425e8ab

8 files changed

Lines changed: 393 additions & 111 deletions

File tree

lib/iris/dashboard/src/components/controller/TaskDetail.vue

Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ import { useAutoRefresh } from '@/composables/useAutoRefresh'
66
import { stateToName } from '@/types/status'
77
import type {
88
TaskStatus,
9+
TaskStatsSnapshot,
910
GetTaskStatusResponse,
1011
} from '@/types/rpc'
11-
import { timestampMs, formatBytes, formatCpuMillicores, formatDuration, formatRelativeTime } from '@/utils/formatting'
12+
import { timestampMs, formatBytes, formatRate, formatCpuMillicores, formatDuration, formatRelativeTime } from '@/utils/formatting'
1213
1314
import { controllerRpcCall } from '@/composables/useRpc'
1415
import { useProfileAction } from '@/composables/useProfileAction'
@@ -77,6 +78,45 @@ const memHistory = computed(() =>
7778
(task.value?.resourceHistory ?? []).map(r => r.memoryMb ? parseFloat(r.memoryMb) : 0)
7879
)
7980
81+
const statsHistory = computed(() => task.value?.taskStatsHistory ?? [])
82+
83+
const latestStats = computed(() => statsHistory.value[statsHistory.value.length - 1] ?? null)
84+
85+
// Compute per-second throughput from cumulative deltas, then smooth with a
86+
// 5-point centred moving average to reduce reporting-interval spikes.
87+
function throughputSeries(field: 'itemsProcessed' | 'bytesProcessed'): number[] {
88+
const snaps = statsHistory.value
89+
if (snaps.length < 2) return []
90+
const raw: number[] = []
91+
for (let i = 1; i < snaps.length; i++) {
92+
const prev = snaps[i - 1]
93+
const curr = snaps[i]
94+
const dtMs = parseFloat(curr.timestampMs ?? '0') - parseFloat(prev.timestampMs ?? '0')
95+
if (dtMs <= 0) continue
96+
const dv = parseFloat(curr[field] ?? '0') - parseFloat(prev[field] ?? '0')
97+
raw.push(Math.max(0, dv / (dtMs / 1000)))
98+
}
99+
const window = 5
100+
return raw.map((_, i) => {
101+
const lo = Math.max(0, i - Math.floor(window / 2))
102+
const hi = Math.min(raw.length, lo + window)
103+
const slice = raw.slice(lo, hi)
104+
return slice.reduce((a, b) => a + b, 0) / slice.length
105+
})
106+
}
107+
108+
const itemsThroughput = computed(() => throughputSeries('itemsProcessed'))
109+
const bytesThroughput = computed(() => throughputSeries('bytesProcessed'))
110+
111+
const itemsThroughputMax = computed(() => {
112+
const m = Math.max(...itemsThroughput.value, 0)
113+
return m > 0 ? m.toFixed(1) + '/s' : '0/s'
114+
})
115+
const bytesThroughputMax = computed(() => {
116+
const m = Math.max(...bytesThroughput.value, 0)
117+
return m > 0 ? formatRate(m) : '0 B/s'
118+
})
119+
80120
// Use job-level resource limits for gauge totals when available.
81121
const cpuTotal = computed(() => {
82122
const jobCpu = (jobResources.value?.cpuMillicores ?? 0) / 1000
@@ -276,16 +316,64 @@ watch(() => props.taskId, async () => {
276316
<div v-if="cpuHistory.length > 1" class="grid grid-cols-2 gap-4 mb-6">
277317
<div class="rounded-lg border border-surface-border bg-surface p-3">
278318
<div class="text-xs text-text-secondary mb-2">CPU %</div>
279-
<Sparkline :data="cpuHistory" :width="200" :height="40" color="var(--color-accent, #2563eb)" />
319+
<Sparkline :data="cpuHistory" :width="200" :height="40" fill color="var(--color-accent, #2563eb)" />
280320
<div class="text-xs font-mono text-text-muted mt-1">{{ cpuUsed.toFixed(0) }}%</div>
281321
</div>
282322
<div class="rounded-lg border border-surface-border bg-surface p-3">
283323
<div class="text-xs text-text-secondary mb-2">Memory (MB)</div>
284-
<Sparkline :data="memHistory" :width="200" :height="40" color="var(--color-status-purple, #8b5cf6)" />
324+
<Sparkline :data="memHistory" :width="200" :height="40" fill color="var(--color-status-purple, #8b5cf6)" />
285325
<div class="text-xs font-mono text-text-muted mt-1">{{ memUsedMb.toFixed(0) }} MB</div>
286326
</div>
287327
</div>
288328

329+
<!-- Task stats -->
330+
<div v-if="statsHistory.length > 0" class="grid grid-cols-2 gap-4 mb-6">
331+
<!-- Left: status text + latest values -->
332+
<div class="rounded-lg border border-surface-border bg-surface p-3 space-y-2">
333+
<div v-if="task.statusMessage" class="text-xs text-text font-mono whitespace-pre-wrap break-all">{{ task.statusMessage }}</div>
334+
<div class="text-xs text-text-muted space-y-1 mt-2">
335+
<div>Items processed: <span class="font-mono text-text">{{ latestStats ? parseInt(latestStats.itemsProcessed ?? '0').toLocaleString() : '—' }}</span></div>
336+
<div>Bytes processed: <span class="font-mono text-text">{{ latestStats ? formatBytes(parseFloat(latestStats.bytesProcessed ?? '0')) : '—' }}</span></div>
337+
</div>
338+
</div>
339+
340+
<!-- Right: throughput charts -->
341+
<div class="space-y-3">
342+
<div class="rounded-lg border border-surface-border bg-surface p-3">
343+
<div class="text-xs text-text-secondary mb-2">Items / sec</div>
344+
<Sparkline
345+
v-if="itemsThroughput.length > 1"
346+
:data="itemsThroughput"
347+
:width="200"
348+
:height="40"
349+
fill
350+
color="var(--color-accent, #2563eb)"
351+
show-y-axis
352+
:y-axis-top-label="itemsThroughputMax"
353+
/>
354+
<div class="text-xs font-mono text-text-muted mt-1">
355+
{{ itemsThroughput.length ? itemsThroughput[itemsThroughput.length - 1].toFixed(1) + ' items/s' : '—' }}
356+
</div>
357+
</div>
358+
<div class="rounded-lg border border-surface-border bg-surface p-3">
359+
<div class="text-xs text-text-secondary mb-2">Bytes / sec</div>
360+
<Sparkline
361+
v-if="bytesThroughput.length > 1"
362+
:data="bytesThroughput"
363+
:width="200"
364+
:height="40"
365+
fill
366+
color="var(--color-status-purple, #8b5cf6)"
367+
show-y-axis
368+
:y-axis-top-label="bytesThroughputMax"
369+
/>
370+
<div class="text-xs font-mono text-text-muted mt-1">
371+
{{ bytesThroughput.length ? formatRate(bytesThroughput[bytesThroughput.length - 1]) : '—' }}
372+
</div>
373+
</div>
374+
</div>
375+
</div>
376+
289377
<!-- Error display -->
290378
<div
291379
v-if="task.error"

lib/iris/dashboard/src/components/shared/Sparkline.vue

Lines changed: 70 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,32 @@ const props = withDefaults(defineProps<{
77
width?: number
88
height?: number
99
fillColor?: string
10+
/** Stretch the SVG to 100% of its container width. */
11+
fill?: boolean
12+
/** Show a max-value line and label on the right. */
13+
showYAxis?: boolean
14+
/** Label for the max tick. If omitted, the raw max value is shown. */
15+
yAxisTopLabel?: string
1016
}>(), {
1117
color: 'var(--color-accent, #2563eb)',
1218
width: 64,
1319
height: 20,
1420
})
1521
1622
const PAD = 1
23+
/** Extra top padding reserved for the y-axis label. */
24+
const LABEL_H = 11
25+
26+
const topPad = computed(() => props.showYAxis ? PAD + LABEL_H : PAD)
27+
28+
const dataMax = computed(() => {
29+
if (!props.data || props.data.length < 1) return 0
30+
return Math.max(...props.data)
31+
})
32+
33+
const topLabel = computed(() =>
34+
props.yAxisTopLabel ?? dataMax.value.toFixed(1)
35+
)
1736
1837
/**
1938
* Build the SVG polyline points string from the data array.
@@ -24,16 +43,15 @@ const points = computed(() => {
2443
if (!props.data || props.data.length < 1) return ''
2544
2645
const data = props.data.length === 1 ? [props.data[0], props.data[0]] : props.data
27-
const max = Math.max(...data)
28-
46+
const max = dataMax.value
2947
const innerW = props.width - 2 * PAD
30-
const innerH = props.height - 2 * PAD
48+
const innerH = props.height - topPad.value - PAD
3149
3250
return data.map((v, i) => {
3351
const x = PAD + (i / (data.length - 1)) * innerW
3452
const y = max === 0
35-
? PAD + innerH
36-
: PAD + innerH - (Math.min(v, max) / max) * innerH
53+
? topPad.value + innerH
54+
: topPad.value + innerH - (Math.min(v, max) / max) * innerH
3755
return `${x.toFixed(1)},${y.toFixed(1)}`
3856
}).join(' ')
3957
})
@@ -42,36 +60,60 @@ const points = computed(() => {
4260
const areaPoints = computed(() => {
4361
if (!points.value) return ''
4462
const innerW = props.width - 2 * PAD
45-
const innerH = props.height - 2 * PAD
46-
const bottomRight = `${(PAD + innerW).toFixed(1)},${(PAD + innerH).toFixed(1)}`
47-
const bottomLeft = `${PAD.toFixed(1)},${(PAD + innerH).toFixed(1)}`
63+
const innerH = props.height - topPad.value - PAD
64+
const bottomRight = `${(PAD + innerW).toFixed(1)},${(topPad.value + innerH).toFixed(1)}`
65+
const bottomLeft = `${PAD.toFixed(1)},${(topPad.value + innerH).toFixed(1)}`
4866
return `${points.value} ${bottomRight} ${bottomLeft}`
4967
})
5068
5169
const hasData = computed(() => props.data && props.data.length >= 1)
5270
</script>
5371

5472
<template>
55-
<svg
73+
<div
5674
v-if="hasData"
57-
class="sparkline"
58-
:width="width"
59-
:height="height"
60-
:viewBox="`0 0 ${width} ${height}`"
61-
preserveAspectRatio="none"
75+
class="relative"
76+
:class="fill ? 'w-full' : 'inline-block'"
77+
:style="fill ? undefined : { width: `${width}px`, height: `${height}px` }"
6278
>
63-
<polygon
64-
v-if="fillColor"
65-
:points="areaPoints"
66-
:fill="fillColor"
67-
/>
68-
<polyline
69-
fill="none"
70-
:stroke="color"
71-
stroke-width="1.5"
72-
stroke-linecap="round"
73-
stroke-linejoin="round"
74-
:points="points"
75-
/>
76-
</svg>
79+
<svg
80+
class="sparkline block"
81+
:class="fill ? 'w-full' : ''"
82+
:width="fill ? undefined : width"
83+
:height="height"
84+
:viewBox="`0 0 ${width} ${height}`"
85+
preserveAspectRatio="none"
86+
>
87+
<!-- Y-axis: thin line across the top of the data area -->
88+
<line
89+
v-if="showYAxis"
90+
:x1="PAD" :y1="topPad"
91+
:x2="width - PAD" :y2="topPad"
92+
stroke="currentColor"
93+
stroke-width="0.5"
94+
opacity="0.25"
95+
/>
96+
97+
<polygon
98+
v-if="fillColor"
99+
:points="areaPoints"
100+
:fill="fillColor"
101+
/>
102+
<polyline
103+
fill="none"
104+
:stroke="color"
105+
stroke-width="1.5"
106+
stroke-linecap="round"
107+
stroke-linejoin="round"
108+
:points="points"
109+
/>
110+
</svg>
111+
112+
<!-- Label rendered as HTML so it isn't stretched by the SVG scaling -->
113+
<span
114+
v-if="showYAxis"
115+
class="absolute right-1 text-[9px] opacity-60 leading-none pointer-events-none"
116+
:style="{ top: `${PAD}px` }"
117+
>{{ topLabel }}</span>
118+
</div>
77119
</template>

lib/iris/dashboard/src/types/rpc.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,12 @@ export interface TaskAttempt {
7777
isWorkerFailure?: boolean
7878
}
7979

80+
export interface TaskStatsSnapshot {
81+
itemsProcessed?: string
82+
bytesProcessed?: string
83+
timestampMs?: string
84+
}
85+
8086
export interface TaskStatus {
8187
taskId: string
8288
state: string
@@ -95,6 +101,8 @@ export interface TaskStatus {
95101
canBeScheduled?: boolean
96102
containerId?: string
97103
resourceHistory?: ResourceUsage[]
104+
statusMessage?: string
105+
taskStatsHistory?: TaskStatsSnapshot[]
98106
}
99107

100108
// -- Jobs --

lib/iris/src/iris/cluster/controller/service.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,8 @@ def task_to_proto(task: TaskDetailRow, worker_address: str = "") -> job_pb2.Task
251251
proto.finished_at.CopyFrom(timestamp_to_proto(current_attempt.finished_at))
252252
if task.container_id:
253253
proto.container_id = task.container_id
254+
if task.status_message:
255+
proto.status_message = task.status_message
254256
# For pending tasks with prior terminal attempts, surface retry context.
255257
if task.state == job_pb2.TASK_STATE_PENDING and task.attempts and task.attempts[-1].state in TERMINAL_TASK_STATES:
256258
last = task.attempts[-1]
@@ -1494,6 +1496,12 @@ def get_task_status(
14941496
"WHERE trh.task_id = ? AND trh.attempt_id = ? ORDER BY trh.id DESC LIMIT ?",
14951497
(task_id.to_wire(), task.current_attempt_id, TASK_RESOURCE_HISTORY_RETENTION),
14961498
)
1499+
stats_rows = q.raw(
1500+
"SELECT tsh.items_processed, tsh.bytes_processed, tsh.timestamp_ms "
1501+
"FROM task_stats_history tsh "
1502+
"WHERE tsh.task_id = ? ORDER BY tsh.id DESC LIMIT ?",
1503+
(task_id.to_wire(), TASK_RESOURCE_HISTORY_RETENTION),
1504+
)
14971505
jc_row = q.raw(
14981506
"SELECT jc.res_cpu_millicores, jc.res_memory_bytes, jc.res_disk_bytes, jc.res_device_json "
14991507
"FROM job_config jc WHERE jc.job_id = ?",
@@ -1509,6 +1517,15 @@ def get_task_status(
15091517
memory_peak_mb=r.memory_peak_mb,
15101518
)
15111519
)
1520+
for r in reversed(stats_rows):
1521+
proto.task_stats_history.append(
1522+
job_pb2.TaskStatsSnapshot(
1523+
items_processed=r.items_processed,
1524+
bytes_processed=r.bytes_processed,
1525+
timestamp_ms=r.timestamp_ms,
1526+
)
1527+
)
1528+
15121529
# Populate resource_usage from the latest history entry (newest is first before reversal).
15131530
if history_rows:
15141531
latest = history_rows[0]

lib/iris/src/iris/rpc/job.proto

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,18 @@ message TaskStatus {
234234
string container_id = 19; // Platform container identifier (Docker container ID, K8s pod name)
235235
// Historical resource snapshots (oldest first). Populated only in GetTaskStatus detail view.
236236
repeated ResourceUsage resource_history = 20;
237+
238+
// Human-readable status string reported by the task (e.g. Zephyr stage progress).
239+
string status_message = 21;
240+
241+
// Historical stats snapshots (oldest first). Populated only in GetTaskStatus detail view.
242+
repeated TaskStatsSnapshot task_stats_history = 22;
243+
}
244+
245+
message TaskStatsSnapshot {
246+
int64 items_processed = 1;
247+
int64 bytes_processed = 2;
248+
int64 timestamp_ms = 3;
237249
}
238250

239251
// Record of a single task execution attempt

lib/iris/src/iris/rpc/job_pb2.py

Lines changed: 80 additions & 78 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

lib/iris/src/iris/rpc/job_pb2.pyi

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ class GetProcessStatusResponse(_message.Message):
341341
def __init__(self, process_info: _Optional[_Union[ProcessInfo, _Mapping]] = ..., log_entries: _Optional[_Iterable[_Union[_logging_pb2.LogEntry, _Mapping]]] = ...) -> None: ...
342342

343343
class TaskStatus(_message.Message):
344-
__slots__ = ("task_id", "state", "worker_id", "worker_address", "exit_code", "error", "started_at", "finished_at", "ports", "resource_usage", "build_metrics", "current_attempt_id", "attempts", "pending_reason", "can_be_scheduled", "container_id", "resource_history")
344+
__slots__ = ("task_id", "state", "worker_id", "worker_address", "exit_code", "error", "started_at", "finished_at", "ports", "resource_usage", "build_metrics", "current_attempt_id", "attempts", "pending_reason", "can_be_scheduled", "container_id", "resource_history", "status_message", "task_stats_history")
345345
class PortsEntry(_message.Message):
346346
__slots__ = ("key", "value")
347347
KEY_FIELD_NUMBER: _ClassVar[int]
@@ -366,6 +366,8 @@ class TaskStatus(_message.Message):
366366
CAN_BE_SCHEDULED_FIELD_NUMBER: _ClassVar[int]
367367
CONTAINER_ID_FIELD_NUMBER: _ClassVar[int]
368368
RESOURCE_HISTORY_FIELD_NUMBER: _ClassVar[int]
369+
STATUS_MESSAGE_FIELD_NUMBER: _ClassVar[int]
370+
TASK_STATS_HISTORY_FIELD_NUMBER: _ClassVar[int]
369371
task_id: str
370372
state: TaskState
371373
worker_id: str
@@ -383,7 +385,19 @@ class TaskStatus(_message.Message):
383385
can_be_scheduled: bool
384386
container_id: str
385387
resource_history: _containers.RepeatedCompositeFieldContainer[ResourceUsage]
386-
def __init__(self, task_id: _Optional[str] = ..., state: _Optional[_Union[TaskState, str]] = ..., worker_id: _Optional[str] = ..., worker_address: _Optional[str] = ..., exit_code: _Optional[int] = ..., error: _Optional[str] = ..., started_at: _Optional[_Union[_time_pb2.Timestamp, _Mapping]] = ..., finished_at: _Optional[_Union[_time_pb2.Timestamp, _Mapping]] = ..., ports: _Optional[_Mapping[str, int]] = ..., resource_usage: _Optional[_Union[ResourceUsage, _Mapping]] = ..., build_metrics: _Optional[_Union[BuildMetrics, _Mapping]] = ..., current_attempt_id: _Optional[int] = ..., attempts: _Optional[_Iterable[_Union[TaskAttempt, _Mapping]]] = ..., pending_reason: _Optional[str] = ..., can_be_scheduled: _Optional[bool] = ..., container_id: _Optional[str] = ..., resource_history: _Optional[_Iterable[_Union[ResourceUsage, _Mapping]]] = ...) -> None: ...
388+
status_message: str
389+
task_stats_history: _containers.RepeatedCompositeFieldContainer[TaskStatsSnapshot]
390+
def __init__(self, task_id: _Optional[str] = ..., state: _Optional[_Union[TaskState, str]] = ..., worker_id: _Optional[str] = ..., worker_address: _Optional[str] = ..., exit_code: _Optional[int] = ..., error: _Optional[str] = ..., started_at: _Optional[_Union[_time_pb2.Timestamp, _Mapping]] = ..., finished_at: _Optional[_Union[_time_pb2.Timestamp, _Mapping]] = ..., ports: _Optional[_Mapping[str, int]] = ..., resource_usage: _Optional[_Union[ResourceUsage, _Mapping]] = ..., build_metrics: _Optional[_Union[BuildMetrics, _Mapping]] = ..., current_attempt_id: _Optional[int] = ..., attempts: _Optional[_Iterable[_Union[TaskAttempt, _Mapping]]] = ..., pending_reason: _Optional[str] = ..., can_be_scheduled: _Optional[bool] = ..., container_id: _Optional[str] = ..., resource_history: _Optional[_Iterable[_Union[ResourceUsage, _Mapping]]] = ..., status_message: _Optional[str] = ..., task_stats_history: _Optional[_Iterable[_Union[TaskStatsSnapshot, _Mapping]]] = ...) -> None: ...
391+
392+
class TaskStatsSnapshot(_message.Message):
393+
__slots__ = ("items_processed", "bytes_processed", "timestamp_ms")
394+
ITEMS_PROCESSED_FIELD_NUMBER: _ClassVar[int]
395+
BYTES_PROCESSED_FIELD_NUMBER: _ClassVar[int]
396+
TIMESTAMP_MS_FIELD_NUMBER: _ClassVar[int]
397+
items_processed: int
398+
bytes_processed: int
399+
timestamp_ms: int
400+
def __init__(self, items_processed: _Optional[int] = ..., bytes_processed: _Optional[int] = ..., timestamp_ms: _Optional[int] = ...) -> None: ...
387401

388402
class TaskAttempt(_message.Message):
389403
__slots__ = ("attempt_id", "worker_id", "state", "exit_code", "error", "started_at", "finished_at", "is_worker_failure")

0 commit comments

Comments
 (0)