Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion app/grandchallenge/components/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1307,9 +1307,17 @@ def delete_stale_provisioned_concurrency_configs(self):
@acks_late_micro_short_task
@transaction.atomic
def preload_interactive_algorithms():
from grandchallenge.reader_studies.models import Question
from grandchallenge.reader_studies.models import Question, ReaderStudy
from grandchallenge.workstations.models import Session

reader_studies_out_of_budget = [
rs.pk
for rs in ReaderStudy.objects.exclude(max_credits__isnull=True).only(
"pk", "max_credits"
)
if not rs.has_budget
]

Comment on lines +1313 to +1320
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This query has an N+1 problem (due to expensive queries in credits_consumed). It's an asynchronous task, but still. Currently looking at annotating credits_consumed in the DB to make this more efficient.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Struggling with turning credits_consumed into an annotation. It involves nested aggregates, and gets quite complex. Maybe prefetching the related objects is good enough here? In general, I would like to know how bad the performance is before I continue trying to optimize this.

@jmsmkn could you compare timings for the following 2 queries:

import time
from grand_challenge.reader_studies.models import ReaderStudy


def query_without_prefetch():
    return [
        rs.pk
        for rs in ReaderStudy.objects.exclude(max_credits__isnull=True).only(
            "pk", "max_credits"
        )
        if not rs.has_budget
    ]


def query_with_prefetch():
    return [
        rs.pk
        for rs in ReaderStudy.objects.exclude(max_credits__isnull=True)
        .prefetch_related(
            "session_utilizations__reader_studies",
            "endpoint_utilizations__reader_studies",
        )
        .only("pk", "max_credits")
        if not rs.has_budget
    ]


def benchmark(label, func):
    start = time.perf_counter()
    result = func()
    end = time.perf_counter()

    return {
        "label": label,
        "time_seconds": end - start,
    }


def main():
    results = [
        benchmark("without_prefetch", query_without_prefetch),
        benchmark("with_prefetch", query_with_prefetch),
    ]

    for r in results:
        print(f"{r['label']}:")
        print(f"  time: {r['time_seconds']:.6f}s")


if __name__ == "__main__":
    main()

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

without_prefetch:
  time: 0.294184s
with_prefetch:
  time: 0.203041s

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Those timings seem fine to me for an asynchronous task and not worth the added complexity of moving the consumed credits into a DB annotation.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would still be worth including the prefetching if those objects are going to be used.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Very true, I had that as a local commit, but forgot to push!

active_interactive_algorithms = (
Question.objects.filter(
reader_study__workstation_sessions__status__in=[
Expand All @@ -1318,6 +1326,7 @@ def preload_interactive_algorithms():
Session.RUNNING,
],
)
.exclude(reader_study__pk__in=reader_studies_out_of_budget)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason to use the negative here? Filtering is going to be more efficient than excluding.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. This seemed easier to read to me and I didn't know that filtering is more efficient than excluding. I'll change it to filter().

.exclude(interactive_algorithm="")
.values_list("interactive_algorithm", flat=True)
.distinct()
Expand Down
53 changes: 53 additions & 0 deletions app/tests/components_tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@
UserUploadFactory,
create_upload_from_file,
)
from tests.utilization_tests.factories import SessionUtilizationFactory


@pytest.mark.django_db
Expand Down Expand Up @@ -1322,6 +1323,58 @@ def test_preload_interactive_algorithms(settings):
assert mock_instance.consolidate.call_count == 1


@pytest.mark.django_db
def test_preload_interactive_algorithms_excludes_reader_studies_without_budget(
settings,
):
arn = f"arn:aws:lambda:eu-central-1:1234567890:function:org-proj-e-uls23-baseline-{uuid.uuid4()}"

settings.INTERACTIVE_ALGORITHMS_LAMBDA_FUNCTIONS = {
"io_bucket_name": "org-proj-e-some-bucket",
"lambda_functions": [
{
# Add a uuid to avoid cache key clashes in testing
"arn": arn,
"internal_name": "uls23-baseline",
"minimum_duration": 1,
"timeout": 60,
"version": "1",
}
],
}

rs_with_exhausted_credit = ReaderStudyFactory(max_credits=100)
session_utilization = SessionUtilizationFactory(
duration=timedelta(hours=1)
)
session_utilization.reader_studies.add(rs_with_exhausted_credit)

QuestionFactory(
reader_study=rs_with_exhausted_credit,
interactive_algorithm=InteractiveAlgorithmLambdaChoices.ULS23_BASELINE,
)

assert not rs_with_exhausted_credit.has_budget

with patch(
"grandchallenge.components.tasks.InteractiveAlgorithmLambda"
) as mock_interactive_algorithm:
mock_instance = mock_interactive_algorithm.return_value
mock_instance.consolidate.return_value = "mocked_consolidation_result"

assert preload_interactive_algorithms() == {
"uls23-baseline": "mocked_consolidation_result"
}

mock_interactive_algorithm.assert_any_call(
arn=arn,
qualifier="1",
should_be_active=False,
)

assert mock_instance.consolidate.call_count == 1


@pytest.mark.django_db
@pytest.mark.parametrize(
"image_factory, job_model_factory, image_attribute_name",
Expand Down
Loading