Skip to content

Commit c1d128a

Browse files
authored
Merge pull request #649 from City-of-Helsinki/link-1408-optimize-distinct
Remove use of distinct in event listing
2 parents 6c8e8fd + 3324685 commit c1d128a

File tree

3 files changed

+150
-86
lines changed

3 files changed

+150
-86
lines changed

events/api.py

Lines changed: 123 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from django.core.cache import cache
2525
from django.core.exceptions import PermissionDenied
2626
from django.db import transaction
27-
from django.db.models import Count, F, Prefetch, Q
27+
from django.db.models import Count, Exists, F, OuterRef, Prefetch, Q, QuerySet
2828
from django.db.models.functions import Greatest
2929
from django.db.utils import IntegrityError
3030
from django.http import Http404, HttpResponsePermanentRedirect
@@ -2511,15 +2511,47 @@ def _get_queryset_from_cache_many(params, param, cache_name, operator, queryset)
25112511
return queryset
25122512

25132513

2514+
def _find_keyword_replacements(keyword_ids: list[str]) -> tuple[list[Keyword], bool]:
2515+
"""
2516+
Convert a list of keyword ids into keyword objects with
2517+
replacements applied.
2518+
2519+
:return: a pair containing a list of keywords and a boolean indicating
2520+
whether all keywords were found
2521+
"""
2522+
keywords = Keyword.objects.filter(id__in=keyword_ids).select_related("replaced_by")
2523+
found_all_keywords = len(keyword_ids) == len(keywords)
2524+
replaced_keywords = [keyword.get_replacement() or keyword for keyword in keywords]
2525+
return list(set(replaced_keywords)), found_all_keywords
2526+
2527+
2528+
def _filter_events_keyword_or(queryset: QuerySet, keyword_ids: list[str]) -> QuerySet:
2529+
"""
2530+
Given an Event queryset, apply OR filter on keyword ids
2531+
"""
2532+
keywords, __ = _find_keyword_replacements(keyword_ids)
2533+
keyword_ids = [keyword.pk for keyword in keywords]
2534+
2535+
kw_qs = Event.keywords.through.objects.filter(
2536+
event=OuterRef("pk"), keyword__in=keyword_ids
2537+
)
2538+
audience_qs = Event.audience.through.objects.filter(
2539+
event=OuterRef("pk"), keyword__in=keyword_ids
2540+
)
2541+
return queryset.filter(Exists(kw_qs) | Exists(audience_qs))
2542+
2543+
25142544
def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
25152545
"""
25162546
Filter events queryset by params
25172547
(e.g. self.request.query_params ingit EventViewSet)
25182548
"""
2519-
# Filter by string (case insensitive). This searches from all fields
2520-
# which are marked translatable in translation.py
2549+
# Please keep in mind that .distinct() will absolutely kill
2550+
# performance of event queries. To avoid duplicate rows in filters
2551+
# consider using Exists instead.
2552+
# Filtering against the through table can also provide
2553+
# benefits (see _filter_events_keyword_or)
25212554

2522-
# filter to get events with or without a registration.
25232555
val = params.get("registration", None)
25242556
if val and parse_bool(val, "registration"):
25252557
queryset = queryset.exclude(registration=None)
@@ -2617,21 +2649,29 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
26172649
vals = params.get("keyword_set_AND", None)
26182650
if vals:
26192651
vals = vals.split(",")
2620-
keyword_sets = KeywordSet.objects.filter(id__in=vals)
2621-
for keyword_set in keyword_sets:
2622-
keywords = keyword_set.keywords.all()
2623-
qset = Q(keywords__in=keywords)
2624-
queryset = queryset.filter(qset)
2652+
kw_sets = KeywordSet.objects.filter(id__in=vals).prefetch_related(
2653+
Prefetch("keywords", Keyword.objects.all().only("id"))
2654+
)
2655+
for kw_set in kw_sets:
2656+
kw_ids = [kw.id for kw in kw_set.keywords.all()]
2657+
subqs = Event.objects.filter(id=OuterRef("pk"), keywords__in=kw_ids)
2658+
queryset = queryset.filter(Exists(subqs))
26252659

26262660
vals = params.get("keyword_set_OR", None)
26272661
if vals:
26282662
vals = vals.split(",")
2629-
keyword_sets = KeywordSet.objects.filter(id__in=vals)
2663+
keyword_sets = KeywordSet.objects.filter(id__in=vals).prefetch_related(
2664+
Prefetch("keywords", Keyword.objects.all().only("id"))
2665+
)
26302666
all_keywords = set()
26312667
for keyword_set in keyword_sets:
26322668
keywords = keyword_set.keywords.all()
26332669
all_keywords.update(keywords)
2634-
queryset = queryset.filter(keywords__in=all_keywords)
2670+
2671+
kw_qs = Event.keywords.through.objects.filter(
2672+
event=OuterRef("pk"), keyword__in=all_keywords
2673+
)
2674+
queryset = queryset.filter(Exists(kw_qs))
26352675

26362676
if "local_ongoing_OR_set" in "".join(params):
26372677
count = 1
@@ -2694,8 +2734,11 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
26942734
for i in all_sets:
26952735
val = params.get(i, None)
26962736
if val:
2697-
val = val.split(",")
2698-
queryset = queryset.filter(keywords__pk__in=val)
2737+
vals = val.split(",")
2738+
kw_qs = Event.keywords.through.objects.filter(
2739+
event=OuterRef("pk"), keyword__in=vals
2740+
)
2741+
queryset = queryset.filter(Exists(kw_qs))
26992742

27002743
val = params.get("internet_based", None)
27012744
if val and parse_bool(val, "internet_based"):
@@ -2707,39 +2750,44 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
27072750
val = params.get("combined_text", None)
27082751
if val:
27092752
val = val.lower()
2710-
qset = Q()
27112753
vals = val.split(",")
2712-
qsets = []
2754+
2755+
combined_q = Q()
27132756
for val in vals:
2757+
val_q = Q()
27142758
# Free string search from all translated event fields
27152759
event_fields = EventTranslationOptions.fields
27162760
for field in event_fields:
27172761
# check all languages for each field
2718-
qset |= _text_qset_by_translated_field(field, val)
2762+
val_q |= _text_qset_by_translated_field(field, val)
27192763

27202764
# Free string search from all translated place fields
27212765
place_fields = PlaceTranslationOptions.fields
27222766
for field in place_fields:
27232767
location_field = "location__" + field
27242768
# check all languages for each field
2725-
qset |= _text_qset_by_translated_field(location_field, val)
2769+
val_q |= _text_qset_by_translated_field(location_field, val)
27262770

27272771
langs = (
27282772
["fi", "sv"]
27292773
if re.search("[\u00C0-\u00FF]", val)
27302774
else ["fi", "sv", "en"]
27312775
)
27322776
tri = [TrigramSimilarity(f"name_{i}", val) for i in langs]
2733-
keywords = (
2777+
keywords = list(
27342778
Keyword.objects.annotate(simile=Greatest(*tri))
27352779
.filter(simile__gt=0.2)
27362780
.order_by("-simile")[:3]
27372781
)
2738-
if keywords:
2739-
qset |= Q(keywords__in=keywords)
2740-
qsets.append(qset)
2741-
qset = Q()
2742-
queryset = queryset.filter(*qsets)
2782+
2783+
val_q |= Q(keywords__in=keywords)
2784+
2785+
combined_q &= val_q
2786+
2787+
# FYI: Biggest slowdown in this filter is the ordering of keywords simile
2788+
queryset = queryset.filter(
2789+
Exists(Event.objects.filter(combined_q, id=OuterRef("pk")).only("id"))
2790+
)
27432791

27442792
val = params.get("text", None)
27452793
if val:
@@ -2866,75 +2914,46 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
28662914
val = val.split(",")
28672915
queryset = queryset.filter(location_id__in=val)
28682916

2869-
# Filter by keyword id, multiple ids separated by comma
2870-
val = params.get("keyword", None)
2871-
if val:
2872-
val = val.split(",")
2873-
try:
2874-
# replaced keywords are looked up for backwards compatibility
2875-
val = [
2876-
getattr(Keyword.objects.get(id=kid).replaced_by, "id", None) or kid
2877-
for kid in val
2878-
]
2879-
except Keyword.DoesNotExist:
2880-
# the user asked for an unknown keyword
2881-
queryset = queryset.none()
2882-
queryset = queryset.filter(
2883-
Q(keywords__pk__in=val) | Q(audience__pk__in=val)
2884-
).distinct()
2917+
if val := params.get("keyword", None):
2918+
queryset = _filter_events_keyword_or(queryset, val.split(","))
28852919

28862920
# 'keyword_OR' behaves the same way as 'keyword'
2887-
val = params.get("keyword_OR", None)
2888-
if val:
2889-
val = val.split(",")
2890-
try:
2891-
# replaced keywords are looked up for backwards compatibility
2892-
val = [
2893-
getattr(Keyword.objects.get(id=kid).replaced_by, "id", None) or kid
2894-
for kid in val
2895-
]
2896-
except Keyword.DoesNotExist:
2897-
# the user asked for an unknown keyword
2898-
queryset = queryset.none()
2899-
queryset = queryset.filter(
2900-
Q(keywords__pk__in=val) | Q(audience__pk__in=val)
2901-
).distinct()
2921+
if val := params.get("keyword_OR", None):
2922+
queryset = _filter_events_keyword_or(queryset, val.split(","))
29022923

29032924
# Filter by keyword ids requiring all keywords to be present in event
29042925
val = params.get("keyword_AND", None)
29052926
if val:
2906-
val = val.split(",")
2907-
for keyword_id in val:
2908-
try:
2909-
# replaced keywords are looked up for backwards compatibility
2910-
val = (
2911-
getattr(Keyword.objects.get(id=keyword_id).replaced_by, "id", None)
2912-
or keyword_id
2913-
)
2914-
except Keyword.DoesNotExist:
2915-
# the user asked for an unknown keyword
2916-
queryset = queryset.none()
2917-
queryset = queryset.filter(
2918-
Q(keywords__pk=keyword_id) | Q(audience__pk=keyword_id)
2927+
keywords, found_all_keywords = _find_keyword_replacements(val.split(","))
2928+
2929+
# If some keywords were not found, AND can not match
2930+
if not found_all_keywords:
2931+
return queryset.none()
2932+
2933+
q = Q()
2934+
for keyword in keywords:
2935+
q &= Q(keywords__pk=keyword.pk) | Q(audience__pk=keyword.pk)
2936+
2937+
for keyword in keywords:
2938+
kw_qs = Event.keywords.through.objects.filter(
2939+
event=OuterRef("pk"), keyword=keyword
29192940
)
2920-
queryset = queryset.distinct()
2941+
audience_qs = Event.audience.through.objects.filter(
2942+
event=OuterRef("pk"), keyword=keyword
2943+
)
2944+
queryset = queryset.filter(Exists(kw_qs) | Exists(audience_qs))
29212945

29222946
# Negative filter for keyword ids
29232947
val = params.get("keyword!", None)
29242948
if val:
2925-
val = val.split(",")
2926-
try:
2927-
# replaced keywords are looked up for backwards compatibility
2928-
val = [
2929-
getattr(Keyword.objects.get(id=kid).replaced_by, "id", None) or kid
2930-
for kid in val
2931-
]
2932-
except Keyword.DoesNotExist:
2933-
# the user asked for an unknown keyword
2934-
pass
2949+
keywords, __ = _find_keyword_replacements(val.split(","))
2950+
keyword_ids = [keyword.pk for keyword in keywords]
2951+
2952+
# This yields an AND NOT ((EXISTS.. keywords )) clause in SQL
2953+
# No distinct needed!
29352954
queryset = queryset.exclude(
2936-
Q(keywords__pk__in=val) | Q(audience__pk__in=val)
2937-
).distinct()
2955+
Q(keywords__pk__in=keyword_ids) | Q(audience__pk__in=keyword_ids)
2956+
)
29382957

29392958
# filter only super or non-super events. to be deprecated?
29402959
val = params.get("recurring", None)
@@ -3018,22 +3037,36 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
30183037
short_desc_arg = {"short_description_" + lang + "__isnull": False}
30193038
q = (
30203039
q
3021-
| Q(in_language__id=lang)
3040+
| Exists(
3041+
Event.in_language.through.objects.filter(
3042+
event=OuterRef("pk"), language=lang
3043+
)
3044+
)
30223045
| Q(**name_arg)
30233046
| Q(**desc_arg)
30243047
| Q(**short_desc_arg)
30253048
)
30263049
else:
3027-
q = q | Q(in_language__id=lang)
3028-
queryset = queryset.filter(q).distinct()
3050+
q = q | Exists(
3051+
Event.in_language.through.objects.filter(
3052+
event=OuterRef("pk"), language=lang
3053+
)
3054+
)
3055+
3056+
queryset = queryset.filter(q)
30293057

30303058
# Filter by in_language field only
30313059
val = params.get("in_language", None)
30323060
if val:
30333061
val = val.split(",")
30343062
q = Q()
30353063
for lang in val:
3036-
q = q | Q(in_language__id=lang)
3064+
q = q | Exists(
3065+
Event.in_language.through.objects.filter(
3066+
event=OuterRef("pk"), language=lang
3067+
)
3068+
)
3069+
30373070
queryset = queryset.filter(q)
30383071

30393072
val = params.get("starts_after", None)
@@ -3119,8 +3152,12 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
31193152
# Filter by free offer
31203153
val = params.get("is_free", None)
31213154
if val and val.lower() in ["true", "false"]:
3155+
# Include events that have at least one free offer
31223156
if val.lower() == "true":
3123-
queryset = queryset.filter(offers__is_free=True)
3157+
queryset = queryset.filter(
3158+
Exists(Offer.objects.filter(event=OuterRef("pk"), is_free=True))
3159+
)
3160+
# Include events that have no free offers
31243161
elif val.lower() == "false":
31253162
queryset = queryset.exclude(offers__is_free=True)
31263163

@@ -3146,7 +3183,7 @@ def _filter_event_queryset(queryset, params, srs=None): # noqa: C901
31463183
| Q(audience_max_age__lt=upper_boundary)
31473184
| Q(Q(audience_min_age=None) & Q(audience_max_age=None))
31483185
)
3149-
return queryset.distinct()
3186+
return queryset
31503187

31513188

31523189
class EventExtensionFilterBackend(BaseFilterBackend):

events/tests/test_event_get.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -496,21 +496,28 @@ def test_language_filter(api_client, event, event2, event3):
496496

497497
# Finnish should be the default language
498498
get_list_and_assert_events("language=fi", [event, event2, event3])
499+
# in_language is explicit filter on the in_language field, so no results for fi
500+
get_list_and_assert_events("in_language=fi", [])
499501

500502
# Swedish should have two events (matches in_language and name_sv)
501503
get_list_and_assert_events("language=sv", [event, event2])
504+
get_list_and_assert_events("in_language=sv", [event2])
502505

503506
# English should have one event (matches in_language)
504507
get_list_and_assert_events("language=en", [event2])
508+
get_list_and_assert_events("in_language=en", [event2])
505509

506510
# Russian should have one event (matches name_ru)
507511
get_list_and_assert_events("language=ru", [event3])
512+
get_list_and_assert_events("in_language=ru", [])
508513

509514
# Chinese should have no events
510515
get_list_and_assert_events("language=zh_hans", [])
516+
get_list_and_assert_events("in_language=zh_hans", [])
511517

512518
# Estonian should have one event (matches in_language), even without translations available
513519
get_list_and_assert_events("language=et", [event3])
520+
get_list_and_assert_events("in_language=et", [event3])
514521

515522

516523
@pytest.mark.django_db

events/tests/test_keyword.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import pytest
22
from rest_framework.exceptions import ValidationError
33

4+
from events.api import _find_keyword_replacements
45
from events.tests.factories import KeywordFactory
56

67

@@ -86,3 +87,22 @@ def test_keyword_get_replacement_multi_level():
8687
old_keyword_2 = KeywordFactory(deprecated=True, replaced_by=old_keyword_1)
8788

8889
assert old_keyword_2.get_replacement().pk == new_keyword.pk
90+
91+
92+
@pytest.mark.django_db
93+
def test_find_keyword_replacements():
94+
new_keyword = KeywordFactory()
95+
replaced_keyword = KeywordFactory(deprecated=True, replaced_by=new_keyword)
96+
other_keyword = KeywordFactory()
97+
unknown_keyword_id = "keyword:doesnotexist"
98+
99+
assert _find_keyword_replacements([replaced_keyword.pk]) == ([new_keyword], True)
100+
assert _find_keyword_replacements([new_keyword.pk]) == ([new_keyword], True)
101+
assert _find_keyword_replacements([new_keyword.pk, replaced_keyword.pk]) == (
102+
[new_keyword],
103+
True,
104+
)
105+
assert _find_keyword_replacements([other_keyword.pk, unknown_keyword_id]) == (
106+
[other_keyword],
107+
False,
108+
)

0 commit comments

Comments
 (0)