Add exec and invocation durations (#4372)

jmsmkn · web-flow · commit 3993ac23a7f7 · 2025-10-22T19:25:28.000+02:00
Adds two new fields:

- `exec_duration` - The duration of the execution, **if measured**. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Includes model loading time, input data loading time, processing time, output data writing time and **any delays from shared hardware issues**.
- `invoke_duration` - The duration of the invocation, **if measured**. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. **Potentially excludes model loading time, depending on the user's implementation**. Includes input data loading time, processing time, output data writing time and **any delays from shared hardware issues**.
  - This will not be set for now and is for future use; it is there so that challenge admins can get used to the fact that there are two fields.

One, both or neither will be set. Neither will be set for existing jobs. Setting and serializing these values will follow in a future PR after all of the container images are upgraded.

Note: There is also `billing_duration`, which includes everything, but we might have to change its implementation if we move to batch jobs, so it will not be serialized for now.

See DIAGNijmegen/rse-roadmap#441
diff --git a/app/grandchallenge/algorithms/migrations/0084_job_exec_duration_job_invoke_duration.py b/app/grandchallenge/algorithms/migrations/0084_job_exec_duration_job_invoke_duration.py
@@ -0,0 +1,33 @@
+# Generated by Django 4.2.25 on 2025-10-22 12:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("algorithms", "0083_alter_job_signing_key"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="job",
+            name="exec_duration",
+            field=models.DurationField(
+                default=None,
+                editable=False,
+                help_text="The duration of the execution, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Includes model loading time, input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
+                null=True,
+            ),
+        ),
+        migrations.AddField(
+            model_name="job",
+            name="invoke_duration",
+            field=models.DurationField(
+                default=None,
+                editable=False,
+                help_text="The duration of the invocation, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Potentially excludes model loading time, depending on the users implementation. Includes input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
+                null=True,
+            ),
+        ),
+    ]
diff --git a/app/grandchallenge/components/backends/amazon_sagemaker_training.py b/app/grandchallenge/components/backends/amazon_sagemaker_training.py
@@ -105,7 +105,7 @@ def _create_job_boto(self):
             },
             StoppingCondition={
                 # https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_StoppingCondition.html
-                "MaxRuntimeInSeconds": self._time_limit,
+                "MaxRuntimeInSeconds": int(self._time_limit.total_seconds()),
             },
             Environment={
                 **self.invocation_environment,
diff --git a/app/grandchallenge/components/backends/base.py b/app/grandchallenge/components/backends/base.py
@@ -9,6 +9,7 @@
 import os
 import secrets
 from abc import ABC, abstractmethod
+from datetime import timedelta
 from json import JSONDecodeError
 from math import ceil
 from pathlib import Path
@@ -30,6 +31,8 @@
 from django.utils._os import safe_join
 from django.utils.functional import cached_property
 from panimg.image_builders import image_builder_mhd, image_builder_tiff
+from pydantic import BaseModel, ConfigDict
+from pydantic_core import to_json
 
 from grandchallenge.cases.tasks import import_images
 from grandchallenge.components.backends.exceptions import (
@@ -266,6 +269,25 @@ async def s3_stream_response(
                 raise
 
 
+class InferenceIO(BaseModel):
+    model_config = ConfigDict(frozen=True)
+
+    relative_path: str
+    bucket_name: str
+    bucket_key: str
+    decompress: bool
+
+
+class InferenceTask(BaseModel):
+    model_config = ConfigDict(frozen=True)
+
+    pk: str
+    inputs: list[InferenceIO]
+    output_bucket_name: str
+    output_prefix: str
+    timeout: timedelta
+
+
 class Executor(ABC):
     def __init__(
         self,
@@ -285,7 +307,7 @@ def __init__(
         self._job_id = job_id
         self._exec_image_repo_tag = exec_image_repo_tag
         self._memory_limit = memory_limit
-        self._time_limit = time_limit
+        self._time_limit = timedelta(seconds=time_limit)
         self._requires_gpu_type = requires_gpu_type
         self._use_warm_pool = (
             use_warm_pool and settings.COMPONENTS_USE_WARM_POOL
@@ -540,16 +562,16 @@ def _get_provisioning_tasks(self, *, input_civs, input_prefixes):
             ):
                 provisioning_tasks.append(civ_provisioning_task.task)
                 invocation_inputs.append(
-                    {
-                        "relative_path": str(
+                    InferenceIO(
+                        relative_path=str(
                             os.path.relpath(
                                 civ_provisioning_task.key, self._io_prefix
                             )
                         ),
-                        "bucket_name": settings.COMPONENTS_INPUT_BUCKET_NAME,
-                        "bucket_key": civ_provisioning_task.key,
-                        "decompress": civ.decompress,
-                    }
+                        bucket_name=settings.COMPONENTS_INPUT_BUCKET_NAME,
+                        bucket_key=civ_provisioning_task.key,
+                        decompress=civ.decompress,
+                    )
                 )
 
         provisioning_tasks.append(
@@ -666,16 +688,17 @@ def _get_civ_provisioning_tasks(self, *, civ, input_prefixes):
 
     def _get_create_invocation_json_task(self, *, invocation_inputs):
         return self._get_upload_input_content_task(
-            content=json.dumps(
+            content=to_json(
                 [
-                    {
-                        "pk": self._job_id,
-                        "inputs": invocation_inputs,
-                        "output_bucket_name": settings.COMPONENTS_OUTPUT_BUCKET_NAME,
-                        "output_prefix": self._io_prefix,
-                    }
+                    InferenceTask(
+                        pk=self._job_id,
+                        inputs=invocation_inputs,
+                        output_bucket_name=settings.COMPONENTS_OUTPUT_BUCKET_NAME,
+                        output_prefix=self._io_prefix,
+                        timeout=self._time_limit,
+                    )
                 ]
-            ).encode("utf-8"),
+            ),
             key=self._invocation_key,
         )
 
diff --git a/app/grandchallenge/components/models.py b/app/grandchallenge/components/models.py
@@ -1619,6 +1619,34 @@ class ComponentJob(FieldChangeMixin, UUIDModel):
     attempt = models.PositiveSmallIntegerField(editable=False, default=0)
     stdout = models.TextField()
     stderr = models.TextField(default="")
+    exec_duration = models.DurationField(
+        null=True,
+        default=None,
+        editable=False,
+        help_text=(
+            "The duration of the execution, if measured. "
+            "Excludes data validation, container pulling, model downloading, "
+            "data downloading and data uploading times. "
+            "Includes model loading time, input data loading time, "
+            "processing time, output data writing time and "
+            "any delays from shared hardware issues."
+        ),
+    )
+    invoke_duration = models.DurationField(
+        null=True,
+        default=None,
+        editable=False,
+        help_text=(
+            "The duration of the invocation, if measured. "
+            "Excludes data validation, container pulling, model downloading, "
+            "data downloading and data uploading times. "
+            "Potentially excludes model loading time, depending on the "
+            "users implementation. "
+            "Includes input data loading time, "
+            "processing time, output data writing time and "
+            "any delays from shared hardware issues."
+        ),
+    )
     runtime_metrics = models.JSONField(default=dict, editable=False)
     error_message = models.CharField(max_length=1024, default="")
     detailed_error_message = models.JSONField(blank=True, default=dict)
diff --git a/app/grandchallenge/evaluation/migrations/0101_evaluation_exec_duration_evaluation_invoke_duration.py b/app/grandchallenge/evaluation/migrations/0101_evaluation_exec_duration_evaluation_invoke_duration.py
@@ -0,0 +1,33 @@
+# Generated by Django 4.2.25 on 2025-10-22 12:51
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("evaluation", "0100_alter_evaluation_signing_key"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="evaluation",
+            name="exec_duration",
+            field=models.DurationField(
+                default=None,
+                editable=False,
+                help_text="The duration of the execution, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Includes model loading time, input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
+                null=True,
+            ),
+        ),
+        migrations.AddField(
+            model_name="evaluation",
+            name="invoke_duration",
+            field=models.DurationField(
+                default=None,
+                editable=False,
+                help_text="The duration of the invocation, if measured. Excludes data validation, container pulling, model downloading, data downloading and data uploading times. Potentially excludes model loading time, depending on the users implementation. Includes input data loading time, processing time, output data writing time and any delays from shared hardware issues.",
+                null=True,
+            ),
+        ),
+    ]
diff --git a/app/tests/components_tests/test_amazon_sagemaker_training_backend.py b/app/tests/components_tests/test_amazon_sagemaker_training_backend.py
@@ -133,7 +133,7 @@ def test_invocation_prefix():
     ),
 )
 def test_transform_job_name(model, container, container_model, key):
-    j = model(pk=uuid4())
+    j = model(pk=uuid4(), time_limit=60)
     setattr(j, container, container_model(pk=uuid4()))
     executor = AmazonSageMakerTrainingExecutor(**j.executor_kwargs)
 
@@ -169,7 +169,7 @@ def test_invocation_json(settings):
         time_limit=60,
         requires_gpu_type=GPUTypeChoices.NO_GPU,
         use_warm_pool=False,
-        signing_key=b"",
+        signing_key=b"totallysecret",
     )
 
     with Stubber(executor._sagemaker_client) as s:
@@ -205,6 +205,7 @@ def test_invocation_json(settings):
                     "GRAND_CHALLENGE_COMPONENT_WRITABLE_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth:/opt/ml/checkpoints:/tmp",
                     "GRAND_CHALLENGE_COMPONENT_POST_CLEAN_DIRECTORIES": "/opt/ml/output/data:/opt/ml/model:/opt/ml/input/data/ground_truth",
                     "GRAND_CHALLENGE_COMPONENT_MAX_MEMORY_MB": "7168",
+                    "GRAND_CHALLENGE_COMPONENT_SIGNING_KEY_HEX": "746f74616c6c79736563726574",
                 },
                 "VpcConfig": {
                     "SecurityGroupIds": [
@@ -216,6 +217,7 @@ def test_invocation_json(settings):
             },
         )
         executor.provision(input_civs=[], input_prefixes={})
+        executor.execute()  # Required to validate expected_params in the stubber
 
     with io.BytesIO() as fileobj:
         executor._s3_client.download_fileobj(
@@ -239,6 +241,7 @@ def test_invocation_json(settings):
             "output_bucket_name": "grand-challenge-components-outputs",
             "output_prefix": f"/io/algorithms/job/{pk}",
             "pk": f"algorithms-job-{pk}",
+            "timeout": "PT1M",
         }
     ]
 
diff --git a/app/tests/components_tests/test_backends.py b/app/tests/components_tests/test_backends.py
@@ -337,6 +337,7 @@ def test_invocation_json(settings):
             "output_bucket_name": "grand-challenge-components-outputs",
             "output_prefix": f"/io/test/test/{job_pk}",
             "pk": f"test-test-{job_pk}",
+            "timeout": "PT1M40S",
         },
     ]
 
@@ -667,6 +668,7 @@ def test_dicom_get_provisioning_tasks():
             "output_bucket_name": "grand-challenge-components-outputs",
             "output_prefix": f"/io/test/test/{job_pk}",
             "pk": f"test-test-{job_pk}",
+            "timeout": "PT1M40S",
         },
     ]
 
diff --git a/dockerfiles/web-base/Dockerfile b/dockerfiles/web-base/Dockerfile
@@ -78,7 +78,7 @@ RUN mkdir -p /opt/docker \
 
 ENV PYTHONUNBUFFERED=1\
     AWS_XRAY_SDK_ENABLED=false\
-    COMPONENTS_SAGEMAKER_SHIM_VERSION=0.5.0\
+    COMPONENTS_SAGEMAKER_SHIM_VERSION=0.6.0\
     PATH="/opt/grand-challenge/.venv/bin:/home/django/.local/bin:${PATH}"
 
 RUN mkdir -p /opt/grand-challenge /app /static /opt/sagemaker-shim /opt/virtualenvs \
@@ -90,7 +90,7 @@ USER django:django
 # Fetch and install sagemaker shim for shimming containers
 RUN mkdir -p /opt/sagemaker-shim \
     && wget "https://github.com/DIAGNijmegen/rse-sagemaker-shim/releases/download/v${COMPONENTS_SAGEMAKER_SHIM_VERSION}/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" -P /opt/sagemaker-shim/ \
-    && echo "e0fe6715342c706a14feb0be9a4343c9cf66efa83a14131b9bf25a8b189d50c6  /opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" | shasum -c - || exit 1 \
+    && echo "a3c33f65ee72e039dd90c9d7a3460de33abdd020f530164b8666975a1ae5e192  /opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" | shasum -c - || exit 1 \
     && tar -C /opt/sagemaker-shim/ -xzvf "/opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz" \
     && rm "/opt/sagemaker-shim/sagemaker-shim-${COMPONENTS_SAGEMAKER_SHIM_VERSION}-Linux-x86_64.tar.gz"
 
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -337,6 +337,7 @@ def test_invocation_json(settings):`
`337`	`337`	`"output_bucket_name": "grand-challenge-components-outputs",`
`338`	`338`	`"output_prefix": f"/io/test/test/{job_pk}",`
`339`	`339`	`"pk": f"test-test-{job_pk}",`
	`340`	`+ "timeout": "PT1M40S",`
`340`	`341`	`},`
`341`	`342`	`]`
`342`	`343`
`@@ -667,6 +668,7 @@ def test_dicom_get_provisioning_tasks():`
`667`	`668`	`"output_bucket_name": "grand-challenge-components-outputs",`
`668`	`669`	`"output_prefix": f"/io/test/test/{job_pk}",`
`669`	`670`	`"pk": f"test-test-{job_pk}",`
	`671`	`+ "timeout": "PT1M40S",`
`670`	`672`	`},`
`671`	`673`	`]`
`672`	`674`