Skip to content

Commit 3993ac2

Browse files
authored
Add exec and invocation durations (#4372)
Adds two new fields: - `exec_duration` - The duration of the execution, **if measured**. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Includes model loading time, input data loading time, processing time, output data writing time and **any delays from shared hardware issues**. - `invoke_duration` - The duration of the invocation, **if measured**. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. **Potentially excludes model loading time, depending on the user's implementation**. Includes input data loading time, processing time, output data writing time and **any delays from shared hardware issues**. - This will not be set for now and is for future use; it is there for challenge admins to get used to the fact that there are two fields. One, both or neither will be set. Neither will be set for existing jobs. Setting and serializing these values will follow in a future PR after all of the container images are upgraded. Note: There is also `billing_duration`, which includes everything, but we might have to change the implementation of that if we go to batch jobs, so this will not be serialized for now. See DIAGNijmegen/rse-roadmap#441
1 parent 87877da commit 3993ac2

9 files changed

Lines changed: 143 additions & 20 deletions

File tree

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Generated by Django 4.2.25 on 2025-10-22 12:51
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the ``exec_duration`` and ``invoke_duration`` fields to ``job``.

    Both fields are non-editable, nullable ``DurationField`` columns that
    default to ``None``.
    """

    dependencies = [("algorithms", "0083_alter_job_signing_key")]

    operations = [
        # Duration of the execution, if measured.
        migrations.AddField(
            model_name="job",
            name="exec_duration",
            field=models.DurationField(
                null=True,
                default=None,
                editable=False,
                help_text="The duration of the execution, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Includes model loading time, input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
            ),
        ),
        # Duration of the invocation, if measured.
        migrations.AddField(
            model_name="job",
            name="invoke_duration",
            field=models.DurationField(
                null=True,
                default=None,
                editable=False,
                help_text="The duration of the invocation, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Potentially excludes model loading time, depending on the users implementation. Includes input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
            ),
        ),
    ]

app/grandchallenge/components/backends/amazon_sagemaker_training.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ def _create_job_boto(self):
105105
},
106106
StoppingCondition={
107107
# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_StoppingCondition.html
108-
"MaxRuntimeInSeconds": self._time_limit,
108+
"MaxRuntimeInSeconds": int(self._time_limit.total_seconds()),
109109
},
110110
Environment={
111111
**self.invocation_environment,

app/grandchallenge/components/backends/base.py

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import os
1010
import secrets
1111
from abc import ABC, abstractmethod
12+
from datetime import timedelta
1213
from json import JSONDecodeError
1314
from math import ceil
1415
from pathlib import Path
@@ -30,6 +31,8 @@
3031
from django.utils._os import safe_join
3132
from django.utils.functional import cached_property
3233
from panimg.image_builders import image_builder_mhd, image_builder_tiff
34+
from pydantic import BaseModel, ConfigDict
35+
from pydantic_core import to_json
3336

3437
from grandchallenge.cases.tasks import import_images
3538
from grandchallenge.components.backends.exceptions import (
@@ -266,6 +269,25 @@ async def s3_stream_response(
266269
raise
267270

268271

272+
class InferenceIO(BaseModel):
    """Immutable description of one input object of an inference task."""

    # Frozen: instances cannot be mutated after construction.
    model_config = ConfigDict(frozen=True)

    # Path of the object relative to the job's IO prefix.
    relative_path: str
    # Bucket that holds the object.
    bucket_name: str
    # Full key of the object inside ``bucket_name``.
    bucket_key: str
    # Whether the object is flagged for decompression.
    decompress: bool
279+
280+
281+
class InferenceTask(BaseModel):
    """Immutable description of one invocation, as written to the invocation JSON."""

    # Frozen: instances cannot be mutated after construction.
    model_config = ConfigDict(frozen=True)

    # Identifier of the job this task belongs to.
    pk: str
    # The input objects for this task.
    inputs: list[InferenceIO]
    # Bucket and key prefix under which outputs are stored.
    output_bucket_name: str
    output_prefix: str
    # Time limit for the task; serialised to JSON as an ISO 8601 duration
    # (for example, 60 seconds becomes "PT1M").
    timeout: timedelta
289+
290+
269291
class Executor(ABC):
270292
def __init__(
271293
self,
@@ -285,7 +307,7 @@ def __init__(
285307
self._job_id = job_id
286308
self._exec_image_repo_tag = exec_image_repo_tag
287309
self._memory_limit = memory_limit
288-
self._time_limit = time_limit
310+
self._time_limit = timedelta(seconds=time_limit)
289311
self._requires_gpu_type = requires_gpu_type
290312
self._use_warm_pool = (
291313
use_warm_pool and settings.COMPONENTS_USE_WARM_POOL
@@ -540,16 +562,16 @@ def _get_provisioning_tasks(self, *, input_civs, input_prefixes):
540562
):
541563
provisioning_tasks.append(civ_provisioning_task.task)
542564
invocation_inputs.append(
543-
{
544-
"relative_path": str(
565+
InferenceIO(
566+
relative_path=str(
545567
os.path.relpath(
546568
civ_provisioning_task.key, self._io_prefix
547569
)
548570
),
549-
"bucket_name": settings.COMPONENTS_INPUT_BUCKET_NAME,
550-
"bucket_key": civ_provisioning_task.key,
551-
"decompress": civ.decompress,
552-
}
571+
bucket_name=settings.COMPONENTS_INPUT_BUCKET_NAME,
572+
bucket_key=civ_provisioning_task.key,
573+
decompress=civ.decompress,
574+
)
553575
)
554576

555577
provisioning_tasks.append(
@@ -666,16 +688,17 @@ def _get_civ_provisioning_tasks(self, *, civ, input_prefixes):
666688

667689
def _get_create_invocation_json_task(self, *, invocation_inputs):
668690
return self._get_upload_input_content_task(
669-
content=json.dumps(
691+
content=to_json(
670692
[
671-
{
672-
"pk": self._job_id,
673-
"inputs": invocation_inputs,
674-
"output_bucket_name": settings.COMPONENTS_OUTPUT_BUCKET_NAME,
675-
"output_prefix": self._io_prefix,
676-
}
693+
InferenceTask(
694+
pk=self._job_id,
695+
inputs=invocation_inputs,
696+
output_bucket_name=settings.COMPONENTS_OUTPUT_BUCKET_NAME,
697+
output_prefix=self._io_prefix,
698+
timeout=self._time_limit,
699+
)
677700
]
678-
).encode("utf-8"),
701+
),
679702
key=self._invocation_key,
680703
)
681704

app/grandchallenge/components/models.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1619,6 +1619,34 @@ class ComponentJob(FieldChangeMixin, UUIDModel):
16191619
attempt = models.PositiveSmallIntegerField(editable=False, default=0)
16201620
stdout = models.TextField()
16211621
stderr = models.TextField(default="")
1622+
exec_duration = models.DurationField(
1623+
null=True,
1624+
default=None,
1625+
editable=False,
1626+
help_text=(
1627+
"The duration of the execution, if measured. "
1628+
"Excludes data validation, container pulling, model downloading, "
1629+
"data downloading and data uploading times. "
1630+
"Includes model loading time, input data loading time, "
1631+
"processing time, output data writing time and "
1632+
"any delays from shared hardware issues."
1633+
),
1634+
)
1635+
invoke_duration = models.DurationField(
1636+
null=True,
1637+
default=None,
1638+
editable=False,
1639+
help_text=(
1640+
"The duration of the invocation, if measured. "
1641+
"Excludes data validation, container pulling, model downloading, "
1642+
"data downloading and data uploading times. "
1643+
"Potentially excludes model loading time, depending on the "
1644+
"users implementation. "
1645+
"Includes input data loading time, "
1646+
"processing time, output data writing time and "
1647+
"any delays from shared hardware issues."
1648+
),
1649+
)
16221650
runtime_metrics = models.JSONField(default=dict, editable=False)
16231651
error_message = models.CharField(max_length=1024, default="")
16241652
detailed_error_message = models.JSONField(blank=True, default=dict)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Generated by Django 4.2.25 on 2025-10-22 12:51
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Add the ``exec_duration`` and ``invoke_duration`` fields to ``evaluation``.

    Both fields are non-editable, nullable ``DurationField`` columns that
    default to ``None``.
    """

    dependencies = [("evaluation", "0100_alter_evaluation_signing_key")]

    operations = [
        # Duration of the execution, if measured.
        migrations.AddField(
            model_name="evaluation",
            name="exec_duration",
            field=models.DurationField(
                null=True,
                default=None,
                editable=False,
                help_text="The duration of the execution, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Includes model loading time, input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
            ),
        ),
        # Duration of the invocation, if measured.
        migrations.AddField(
            model_name="evaluation",
            name="invoke_duration",
            field=models.DurationField(
                null=True,
                default=None,
                editable=False,
                help_text="The duration of the invocation, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Potentially excludes model loading time, depending on the users implementation. Includes input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
            ),
        ),
    ]

app/tests/components_tests/test_amazon_sagemaker_training_backend.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ def test_invocation_prefix():
133133
),
134134
)
135135
def test_transform_job_name(model, container, container_model, key):
136-
j = model(pk=uuid4())
136+
j = model(pk=uuid4(), time_limit=60)
137137
setattr(j, container, container_model(pk=uuid4()))
138138
executor = AmazonSageMakerTrainingExecutor(**j.executor_kwargs)
139139

@@ -169,7 +169,7 @@ def test_invocation_json(settings):
169169
time_limit=60,
170170
requires_gpu_type=GPUTypeChoices.NO_GPU,
171171
use_warm_pool=False,
172-
signing_key=b"",
172+
signing_key=b"totallysecret",
173173
)
174174

175175
with Stubber(executor._sagemaker_client) as s:
@@ -205,6 +205,7 @@ def test_invocation_json(settings):
205205
"GRAND_CHALLENGE_COMPONENT_WRITABLE_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth:/opt/ml/checkpoints:/tmp",
206206
"GRAND_CHALLENGE_COMPONENT_POST_CLEAN_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth",
207207
"GRAND_CHALLENGE_COMPONENT_MAX_MEMORY_MB": "7168",
208+
"GRAND_CHALLENGE_COMPONENT_SIGNING_KEY_HEX": "746f74616c6c79736563726574",
208209
},
209210
"VpcConfig": {
210211
"SecurityGroupIds": [
@@ -216,6 +217,7 @@ def test_invocation_json(settings):
216217
},
217218
)
218219
executor.provision(input_civs=[], input_prefixes={})
220+
executor.execute() # Required to validate expected_params in the stubber
219221

220222
with io.BytesIO() as fileobj:
221223
executor._s3_client.download_fileobj(
@@ -239,6 +241,7 @@ def test_invocation_json(settings):
239241
"output_bucket_name": "grand-challenge-components-outputs",
240242
"output_prefix": f"/io/algorithms/job/{pk}",
241243
"pk": f"algorithms-job-{pk}",
244+
"timeout": "PT1M",
242245
}
243246
]
244247

app/tests/components_tests/test_backends.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,7 @@ def test_invocation_json(settings):
337337
"output_bucket_name": "grand-challenge-components-outputs",
338338
"output_prefix": f"/io/test/test/{job_pk}",
339339
"pk": f"test-test-{job_pk}",
340+
"timeout": "PT1M40S",
340341
},
341342
]
342343

@@ -667,6 +668,7 @@ def test_dicom_get_provisioning_tasks():
667668
"output_bucket_name": "grand-challenge-components-outputs",
668669
"output_prefix": f"/io/test/test/{job_pk}",
669670
"pk": f"test-test-{job_pk}",
671+
"timeout": "PT1M40S",
670672
},
671673
]
672674

dockerfiles/web-base/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ RUN mkdir -p /opt/docker \
7878

7979
ENV PYTHONUNBUFFERED=1\
8080
AWS_XRAY_SDK_ENABLED=false\
81-
COMPONENTS_SAGEMAKER_SHIM_VERSION=0.5.0\
81+
COMPONENTS_SAGEMAKER_SHIM_VERSION=0.6.0\
8282
PATH="/opt/grand-challenge/.venv/bin:/home/django/.local/bin:${PATH}"
8383

8484
RUN mkdir -p /opt/grand-challenge /app /static /opt/sagemaker-shim /opt/virtualenvs \
@@ -90,7 +90,7 @@ USER django:django
9090
# Fetch and install sagemaker shim for shimming containers
9191
RUN mkdir -p /opt/sagemaker-shim \
9292
&& wget "https://github.com/DIAGNijmegen/rse-sagemaker-shim/releases/download/v${COMPONENTS_SAGEMAKER_SHIM_VERSION}/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" -P /opt/sagemaker-shim/ \
93-
&& echo "e0fe6715342c706a14feb0be9a4343c9cf66efa83a14131b9bf25a8b189d50c6 /opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" | shasum -c - || exit 1 \
93+
&& echo "a3c33f65ee72e039dd90c9d7a3460de33abdd020f530164b8666975a1ae5e192 /opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" | shasum -c - || exit 1 \
9494
&& tar -C /opt/sagemaker-shim/ -xzvf "/opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" \
9595
&& rm "/opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz"
9696

uv.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)