Skip to content

Commit dac9c0a

Browse files
authored
Add setting and serialization of exec and invoke durations (#4385)
Closes DIAGNijmegen/rse-roadmap#441
1 parent 8c3b1e8 commit dac9c0a

23 files changed

Lines changed: 304 additions & 108 deletions

File tree

app/config/settings.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,7 @@ def sentry_before_send(event, hint):
896896
"PAGE_SIZE": 100,
897897
"UNAUTHENTICATED_USER": "guardian.utils.get_anonymous_user",
898898
"DEFAULT_SCHEMA_CLASS": "drf_spectacular.openapi.AutoSchema",
899+
"DURATION_FORMAT": "iso-8601",
899900
}
900901

901902
SPECTACULAR_SETTINGS = {

app/grandchallenge/algorithms/admin.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,7 @@ class JobAdmin(admin.ModelAdmin):
202202
"is_complimentary",
203203
"credits_consumed",
204204
"time_limit",
205+
"exec_duration",
205206
"requires_gpu_type",
206207
"requires_memory_gb",
207208
"use_warm_pool",
@@ -235,6 +236,8 @@ class JobAdmin(admin.ModelAdmin):
235236
"runtime_metrics",
236237
"algorithm_interface",
237238
"time_limit",
239+
"exec_duration",
240+
"invoke_duration",
238241
"job_utilization",
239242
"public",
240243
"algorithm_model",

app/grandchallenge/algorithms/serializers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
ComponentInterfaceValueSerializer,
3333
HyperlinkedComponentInterfaceValueSerializer,
3434
)
35+
from grandchallenge.core.drf_fields import ISODurationField
3536
from grandchallenge.core.guardian import filter_by_permission
3637
from grandchallenge.core.templatetags.remove_whitespace import oxford_comma
3738
from grandchallenge.hanging_protocols.serializers import (
@@ -143,6 +144,9 @@ class JobSerializer(serializers.ModelSerializer):
143144
source="algorithm_image.algorithm.view_content", read_only=True
144145
)
145146

147+
exec_duration = ISODurationField(read_only=True)
148+
invoke_duration = ISODurationField(read_only=True)
149+
146150
class Meta:
147151
model = Job
148152
fields = [
@@ -156,6 +160,8 @@ class Meta:
156160
"hanging_protocol",
157161
"optional_hanging_protocols",
158162
"view_content",
163+
"exec_duration",
164+
"invoke_duration",
159165
]
160166

161167

app/grandchallenge/components/admin.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,13 +113,18 @@ def requeue_jobs(modeladmin, request, queryset):
113113
jobs = []
114114

115115
for job in queryset:
116+
# If adding fields here remember to add them to the bulk_update fields below
116117
job.status = ComponentJob.RETRY
117118
job.attempt += 1
118-
job.utilization.duration = None
119-
job.utilization.save()
119+
job.exec_duration = None
120+
job.invoke_duration = None
120121
job.use_warm_pool = False
121122
job.error_message = ""
122123
job.detailed_error_message = {}
124+
125+
job.utilization.duration = None
126+
job.utilization.save()
127+
123128
jobs.append(job)
124129

125130
on_commit(job.execute)
@@ -129,6 +134,9 @@ def requeue_jobs(modeladmin, request, queryset):
129134
fields=[
130135
"status",
131136
"attempt",
137+
"exec_duration",
138+
"invoke_duration",
139+
"use_warm_pool",
132140
"error_message",
133141
"detailed_error_message",
134142
],

app/grandchallenge/components/backends/amazon_sagemaker_base.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,7 @@ def _stop_job_boto(self):
357357
def __init__(self, *args, **kwargs):
358358
super().__init__(*args, **kwargs)
359359

360-
self.__duration = None
360+
self.__utilization_duration = None
361361
self.__runtime_metrics = {}
362362

363363
self.__sagemaker_client = None
@@ -413,8 +413,8 @@ def _cloudwatch_client(self):
413413
return self.__cloudwatch_client
414414

415415
@property
416-
def duration(self):
417-
return self.__duration
416+
def utilization_duration(self):
417+
return self.__utilization_duration
418418

419419
@property
420420
def runtime_metrics(self):
@@ -536,10 +536,10 @@ def _set_duration(self, *, event):
536536
self._get_start_time(event=event)
537537
)
538538
stopped = ms_timestamp_to_datetime(self._get_end_time(event=event))
539-
self.__duration = stopped - started
539+
self.__utilization_duration = stopped - started
540540
except TypeError:
541541
logger.warning("Invalid start or end time, duration undetermined")
542-
self.__duration = None
542+
self.__utilization_duration = None
543543

544544
def _get_log_stream_name(self, *, data_log=False):
545545
response = self._logs_client.describe_log_streams(
@@ -718,7 +718,9 @@ def _handle_failed_job(self, *, event):
718718
"Out of Memory. Please use a larger instance",
719719
):
720720
try:
721-
users_process_exit_code = self._get_task_return_code()
721+
users_process_exit_code = (
722+
self._get_inference_result().return_code
723+
)
722724
except UncleanExit:
723725
users_process_exit_code = None
724726

app/grandchallenge/components/backends/base.py

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import boto3
2222
import botocore
2323
import httpx
24+
import pydantic
2425
from asgiref.sync import async_to_sync
2526
from botocore.auth import SigV4Auth
2627
from botocore.awsrequest import AWSRequest
@@ -288,6 +289,17 @@ class InferenceTask(BaseModel):
288289
timeout: timedelta
289290

290291

292+
class InferenceResult(BaseModel):
293+
model_config = ConfigDict(frozen=True)
294+
295+
pk: str
296+
return_code: int
297+
exec_duration: timedelta | None
298+
invoke_duration: timedelta | None
299+
outputs: list[InferenceIO]
300+
sagemaker_shim_version: str
301+
302+
291303
class Executor(ABC):
292304
def __init__(
293305
self,
@@ -313,11 +325,15 @@ def __init__(
313325
use_warm_pool and settings.COMPONENTS_USE_WARM_POOL
314326
)
315327
self._signing_key = signing_key
328+
self._algorithm_model = algorithm_model
329+
self._ground_truth = ground_truth
330+
331+
self._exec_duration = None
332+
self._invoke_duration = None
316333
self._stdout = []
317334
self._stderr = []
335+
318336
self.__s3_client = None
319-
self._algorithm_model = algorithm_model
320-
self._ground_truth = ground_truth
321337

322338
def provision(self, *, input_civs, input_prefixes):
323339
# We cannot run everything async as it requires database access.
@@ -387,7 +403,15 @@ def stderr(self):
387403

388404
@property
389405
@abstractmethod
390-
def duration(self): ...
406+
def utilization_duration(self): ...
407+
408+
@property
409+
def exec_duration(self):
410+
return self._exec_duration
411+
412+
@property
413+
def invoke_duration(self):
414+
return self._invoke_duration
391415

392416
@property
393417
@abstractmethod
@@ -437,12 +461,13 @@ def _max_memory_mb(self):
437461

438462
@property
439463
def compute_cost_euro_millicents(self):
440-
duration = self.duration
441-
if duration is None:
464+
utilization_duration = self.utilization_duration
465+
if utilization_duration is None:
442466
return None
443467
else:
444468
return duration_to_millicents(
445-
duration=duration, usd_cents_per_hour=self.usd_cents_per_hour
469+
duration=utilization_duration,
470+
usd_cents_per_hour=self.usd_cents_per_hour,
446471
)
447472

448473
@property
@@ -467,7 +492,7 @@ def _invocation_key(self):
467492
return safe_join(self._invocation_prefix, "invocation.json")
468493

469494
@property
470-
def _result_key(self):
495+
def _inference_result_key(self):
471496
return safe_join(
472497
self._io_prefix, ".sagemaker_shim", "inference_result.json"
473498
)
@@ -802,11 +827,11 @@ def _get_upload_input_content_task(*, content, key):
802827
key=key,
803828
)
804829

805-
def _get_task_return_code(self):
830+
def _get_inference_result(self):
806831
try:
807832
response = self._s3_client.get_object(
808833
Bucket=settings.COMPONENTS_OUTPUT_BUCKET_NAME,
809-
Key=self._result_key,
834+
Key=self._inference_result_key,
810835
)
811836
except botocore.exceptions.ClientError as error:
812837
if error.response["Error"]["Code"] == "404":
@@ -834,26 +859,29 @@ def _get_task_return_code(self):
834859
)
835860

836861
try:
837-
result = json.loads(body.decode("utf-8"))
838-
except JSONDecodeError:
862+
inference_result = InferenceResult.model_validate_json(
863+
json_data=body
864+
)
865+
except pydantic.ValidationError as error:
866+
logger.error(error, exc_info=True)
839867
raise ComponentException(
840868
"The invocation request did not return valid json"
841869
)
842870

843-
logger.info(f"{result=}")
871+
logger.info(f"{inference_result=}")
844872

845-
if result["pk"] != self._job_id:
873+
if inference_result.pk != self._job_id:
846874
raise RuntimeError("Wrong result key for this job")
847875

848-
try:
849-
return int(result["return_code"])
850-
except (KeyError, ValueError):
851-
raise ComponentException(
852-
"The invocation response object is not valid"
853-
)
876+
return inference_result
854877

855878
def _handle_completed_job(self):
856-
users_process_exit_code = self._get_task_return_code()
879+
inference_result = self._get_inference_result()
880+
881+
self._exec_duration = inference_result.exec_duration
882+
self._invoke_duration = inference_result.invoke_duration
883+
884+
users_process_exit_code = inference_result.return_code
857885

858886
if users_process_exit_code == 0:
859887
# Job's a good un

app/grandchallenge/components/models.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1736,15 +1736,17 @@ def save(self, *args, **kwargs):
17361736
if adding:
17371737
self.create_utilization()
17381738

1739-
def update_status(
1739+
def update_status( # noqa:C901
17401740
self,
17411741
*,
17421742
status: STATUS_CHOICES,
17431743
stdout: str = "",
17441744
stderr: str = "",
17451745
error_message="",
17461746
detailed_error_message=None,
1747-
duration=None,
1747+
utilization_duration=None,
1748+
exec_duration=None,
1749+
invoke_duration=None,
17481750
compute_cost_euro_millicents=None,
17491751
runtime_metrics=None,
17501752
):
@@ -1765,10 +1767,16 @@ def update_status(
17651767
for key, value in detailed_error_message.items()
17661768
}
17671769

1768-
if duration is not None:
1769-
self.utilization.duration = duration
1770+
if utilization_duration is not None:
1771+
self.utilization.duration = utilization_duration
17701772
self.utilization.save(update_fields=["duration"])
17711773

1774+
if exec_duration is not None:
1775+
self.exec_duration = exec_duration
1776+
1777+
if invoke_duration is not None:
1778+
self.invoke_duration = invoke_duration
1779+
17721780
if compute_cost_euro_millicents is not None:
17731781
self.utilization.compute_cost_euro_millicents = (
17741782
compute_cost_euro_millicents

app/grandchallenge/components/tasks.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -857,7 +857,9 @@ def get_update_status_kwargs(*, executor=None):
857857
return {
858858
"stdout": executor.stdout,
859859
"stderr": executor.stderr,
860-
"duration": executor.duration,
860+
"utilization_duration": executor.utilization_duration,
861+
"exec_duration": executor.exec_duration,
862+
"invoke_duration": executor.invoke_duration,
861863
"compute_cost_euro_millicents": executor.compute_cost_euro_millicents,
862864
"runtime_metrics": executor.runtime_metrics,
863865
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from django.utils.duration import duration_iso_string
2+
from rest_framework.fields import DurationField
3+
4+
5+
class ISODurationField(DurationField):
6+
# TODO - This functionality will in DRF 3.17, remove when released
7+
def to_representation(self, value):
8+
return duration_iso_string(value)

app/grandchallenge/evaluation/admin.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ class EvaluationAdmin(admin.ModelAdmin):
180180
"created",
181181
"submission",
182182
"time_limit",
183+
"exec_duration",
183184
"requires_gpu_type",
184185
"requires_memory_gb",
185186
"use_warm_pool",
@@ -221,6 +222,8 @@ class EvaluationAdmin(admin.ModelAdmin):
221222
"runtime_metrics",
222223
"claimed_by",
223224
"ground_truth",
225+
"exec_duration",
226+
"invoke_duration",
224227
"evaluation_utilization",
225228
)
226229
actions = (requeue_jobs, cancel_jobs, deprovision_jobs)

0 commit comments

Comments
 (0)