Skip to content

Commit 191e80c

Browse files
authored
Merge pull request #1491 from openzim/measure-resources
measure task CPU percentage and disk usage
2 parents bb10b21 + b3a9a34 commit 191e80c

File tree

4 files changed

+173
-18
lines changed

4 files changed

+173
-18
lines changed

backend/src/zimfarm_backend/common/schemas/orms.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,14 +113,18 @@ class TaskContainerProgressSchema(BaseModel):
113113
total: int | None = None
114114

115115

116-
class TaskContainerMemoryStatsSchema(BaseModel):
117-
max_usage: int | None = None
116+
class TaskResourceUsageSchema(BaseModel):
117+
max_usage: int | float | None = Field(default=None, alias="max")
118118

119119

120-
class TaskContainerStatsSchema(BaseModel):
121-
memory: TaskContainerMemoryStatsSchema = Field(
122-
default_factory=TaskContainerMemoryStatsSchema
123-
)
120+
class TaskCPUUsageSchema(TaskResourceUsageSchema):
121+
avg_usage: float | None = Field(default=None, alias="avg")
122+
123+
124+
class TaskStatsSchema(BaseModel):
125+
memory: TaskResourceUsageSchema = Field(default_factory=TaskResourceUsageSchema)
126+
cpu: TaskCPUUsageSchema = Field(default_factory=TaskCPUUsageSchema)
127+
disk: TaskResourceUsageSchema = Field(default_factory=TaskResourceUsageSchema)
124128

125129

126130
class TaskContainerSchema(BaseModel):
@@ -130,7 +134,7 @@ class TaskContainerSchema(BaseModel):
130134

131135
log: str | None = None
132136
image: str | None = None
133-
stats: dict[str, Any] | None = None
137+
stats: TaskStatsSchema | None = None
134138
artifacts: str | None = None
135139
stderr: str | None = None
136140
stdout: str | None = None

frontend-ui/src/types/tasks.ts

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,15 @@ export interface TaskFile {
3232
info?: TaskFileInfo
3333
}
3434

35+
export interface TaskResourceUsage {
36+
max?: number | null
37+
}
38+
39+
export interface TaskCPUUsage {
40+
max?: number
41+
avg?: number
42+
}
43+
3544
export interface TaskContainer {
3645
command: string[]
3746
exit_code?: number
@@ -46,9 +55,9 @@ export interface TaskContainer {
4655
total: number
4756
} | null
4857
stats?: {
49-
memory?: {
50-
max_usage?: number | null
51-
}
58+
memory?: TaskResourceUsage
59+
disk?: TaskResourceUsage
60+
cpu?: TaskCPUUsage
5261
}
5362
}
5463

frontend-ui/src/views/TaskDetailView.vue

Lines changed: 73 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -324,13 +324,55 @@
324324
<code class="text-pink-accent-2">{{ taskContainer.exit_code }}</code>
325325
</td>
326326
</tr>
327-
<tr v-if="maxMemory != null">
327+
<tr v-if="hasStats">
328328
<th class="text-left w-20">Stats</th>
329329
<td>
330-
<v-chip size="small" class="mr-2">
331-
<v-icon size="small" class="mr-1">mdi-memory</v-icon>
332-
{{ maxMemory }} (max)
333-
</v-chip>
330+
<div class="d-flex flex-wrap ga-2">
331+
<v-tooltip
332+
v-if="maxMemory"
333+
text="Maximum memory used during task execution"
334+
>
335+
<template #activator="{ props }">
336+
<v-chip v-bind="props" size="small">
337+
<v-icon size="small" class="mr-1">mdi-memory</v-icon>
338+
{{ maxMemory }} (max)
339+
</v-chip>
340+
</template>
341+
</v-tooltip>
342+
<v-tooltip
343+
v-if="maxDisk"
344+
text="Maximum disk space used during task execution"
345+
>
346+
<template #activator="{ props }">
347+
<v-chip v-bind="props" size="small">
348+
<v-icon size="small" class="mr-1">mdi-harddisk</v-icon>
349+
{{ maxDisk }} (max)
350+
</v-chip>
351+
</template>
352+
</v-tooltip>
353+
<v-tooltip
354+
v-if="hasCpuStats && cpuStats && cpuStats.max !== null"
355+
text="Maximum CPU usage percentage during task execution"
356+
>
357+
<template #activator="{ props }">
358+
<v-chip v-bind="props" size="small">
359+
<v-icon size="small" class="mr-1">mdi-cpu-64-bit</v-icon>
360+
{{ cpuStats.max.toFixed(1) }}% (max)
361+
</v-chip>
362+
</template>
363+
</v-tooltip>
364+
<v-tooltip
365+
v-if="hasCpuStats && cpuStats && cpuStats.avg !== null"
366+
text="Average CPU usage percentage during task execution"
367+
>
368+
<template #activator="{ props }">
369+
<v-chip v-bind="props" size="small">
370+
<v-icon size="small" class="mr-1">mdi-chart-line</v-icon>
371+
{{ cpuStats.avg.toFixed(1) }}% (avg)
372+
</v-chip>
373+
</template>
374+
</v-tooltip>
375+
</div>
334376
</td>
335377
</tr>
336378
<tr v-if="taskProgress">
@@ -578,12 +620,37 @@ const canCancel = computed(() => {
578620
579621
const maxMemory = computed(() => {
580622
try {
581-
return formattedBytesSize(taskContainer.value?.stats?.memory?.max_usage || 0)
623+
return formattedBytesSize(taskContainer.value?.stats?.memory?.max || 0)
624+
} catch {
625+
return null
626+
}
627+
})
628+
629+
const maxDisk = computed(() => {
630+
try {
631+
return formattedBytesSize(taskContainer.value?.stats?.disk?.max || 0)
582632
} catch {
583633
return null
584634
}
585635
})
586636
637+
const cpuStats = computed(() => {
638+
const stats = taskContainer.value?.stats?.cpu
639+
if (!stats) return null
640+
return {
641+
max: stats.max ?? null,
642+
avg: stats.avg ?? null,
643+
}
644+
})
645+
646+
const hasCpuStats = computed(() => {
647+
return cpuStats.value && (cpuStats.value.max !== null || cpuStats.value.avg !== null)
648+
})
649+
650+
const hasStats = computed(() => {
651+
return maxMemory.value || maxDisk.value || hasCpuStats.value
652+
})
653+
587654
const monitoringUrl = computed(() => {
588655
return `http://monitoring.openzim.org/host/${scheduleName.value}_${shortId.value}.${
589656
task.value?.worker_name

worker/src/zimfarm_worker/task/worker.py

Lines changed: 77 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,9 @@
4040
from zimfarm_worker.task.zim import get_zim_info
4141

4242
SLEEP_INTERVAL = 60 # nb of seconds to sleep before watching
43+
CPU_EWMA_ALPHA = 0.01 # EWMA smoothing factor for CPU percentage samples (0..1)
44+
45+
4346
PENDING = "pending"
4447
DOING = "doing"
4548
DONE = "done"
@@ -132,6 +135,9 @@ def __init__(
132135
self.scraper_succeeded: bool | None = None # whether scraper succeeded
133136

134137
self.max_memory_usage: int = 0 # maximum memory used by scraper
138+
self.max_disk_usage: int = 0 # maximum disk used by scraper
139+
self.avg_cpu_usage: float = 0.0 # exponentially weighted moving average (EWMA) of cpu percentage
140+
self.max_cpu_usage: float = 0.0 # maximum cpu percentage used by scraper
135141

136142
# register stop/^C
137143
self.register_signals()
@@ -192,8 +198,62 @@ def mark_scraper_completed(self, exit_code: int, stdout: str, stderr: str):
192198
}
193199
)
194200

201+
def _get_scraper_disk_usage(self) -> int:
202+
"""
203+
Get disk usage of scraper container's task workdir in bytes.
204+
205+
Calculates the actual disk space used by files in the scraper's
206+
task workdir (where ZIM files and other outputs are written).
207+
"""
208+
if not self.task_workdir:
209+
return 0
210+
211+
try:
212+
if self.task_workdir.exists() and self.task_workdir.is_dir():
213+
return sum(
214+
f.stat().st_size
215+
for f in self.task_workdir.rglob("*")
216+
if f.is_file()
217+
)
218+
return 0
219+
except Exception:
220+
logger.exception("Failed to get scraper disk usage")
221+
return 0
222+
223+
def _compute_scraper_cpu_stats(self, scraper_stats: dict[str, Any]) -> float:
224+
"""
225+
Compute CPU usage statistics from scraper container stats.
226+
227+
Calculates CPU percentage with EWMA smoothing to reduce effect of
228+
short spikes.
229+
"""
230+
cpu_sample = 0.0
231+
cpu_stats = scraper_stats.get("cpu_stats", {})
232+
precpu_stats = scraper_stats.get("precpu_stats", {})
233+
prev_total = precpu_stats.get("cpu_usage", {}).get("total_usage", 0)
234+
curr_total = cpu_stats.get("cpu_usage", {}).get("total_usage", 0)
235+
prev_system = precpu_stats.get("system_cpu_usage", 0)
236+
curr_system = cpu_stats.get("system_cpu_usage", 0)
237+
238+
delta_cpu = curr_total - prev_total
239+
delta_system = curr_system - prev_system
240+
online_cpus = cpu_stats.get("online_cpus", 0)
241+
242+
if delta_system > 0 and delta_cpu >= 0 and online_cpus > 0:
243+
cpu_sample = (delta_cpu / float(delta_system)) * float(online_cpus) * 100.0
244+
245+
# apply EWMA smoothing to reduce effect of short spikes
246+
if self.avg_cpu_usage == 0.0:
247+
self.avg_cpu_usage = cpu_sample
248+
else:
249+
self.avg_cpu_usage = (
250+
CPU_EWMA_ALPHA * cpu_sample
251+
+ (1.0 - CPU_EWMA_ALPHA) * self.avg_cpu_usage
252+
)
253+
return cpu_sample
254+
195255
def submit_scraper_progress(self):
196-
"""report last lines of scraper to the API"""
256+
"""report scraper statistics and logs to the API"""
197257
if not self.scraper:
198258
logger.error("No scraper to update")
199259
return
@@ -204,17 +264,32 @@ def submit_scraper_progress(self):
204264
stream=False
205265
)
206266
scraper_stats = cast(dict[str, Any], scraper_stats)
267+
207268
# update statistics
208269
self.max_memory_usage = max(
209270
[
210271
scraper_stats.get("memory_stats", {}).get("usage", 0),
211272
self.max_memory_usage,
212273
]
213274
)
275+
276+
cpu_sample = self._compute_scraper_cpu_stats(scraper_stats)
277+
self.max_cpu_usage = max([cpu_sample, self.max_cpu_usage])
278+
279+
disk_usage = self._get_scraper_disk_usage()
280+
self.max_disk_usage = max([disk_usage, self.max_disk_usage])
281+
214282
stats: dict[str, Any] = {
215283
"memory": {
216284
"max_usage": self.max_memory_usage,
217-
}
285+
},
286+
"cpu": {
287+
"max_usage": self.max_cpu_usage,
288+
"avg_usage": round(self.avg_cpu_usage, 2),
289+
},
290+
"disk": {
291+
"max_usage": self.max_disk_usage,
292+
},
218293
}
219294

220295
# fetch and compute progression from progress file

0 commit comments

Comments
 (0)