# posthog/products/cohorts/backend/models/dependencies.py at 3362c1feecb2ca49698cc0a14513bade5bdfef0b · PostHog/posthog · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
from collections import defaultdict
from typing import Any

from django.core.cache import cache
from django.db import transaction
from django.db.models import Q, TextField
from django.db.models.functions import Cast
from django.db.models.signals import post_delete, post_save, pre_save
from django.dispatch import receiver

import posthoganalytics
from prometheus_client import Counter
from rest_framework.exceptions import ValidationError
from structlog import get_logger

from posthog.models.team.team import Team
from posthog.redis import get_client as get_redis_client

from products.cohorts.backend.models.cohort import Cohort, CohortType, is_cohort_recalculation_only_save

logger = get_logger(__name__)
# TTL (seconds) for the per-cohort dependencies / dependents cache entries.
DEPENDENCY_CACHE_TIMEOUT = 7 * 24 * 60 * 60  # 1 week
# Countdown (seconds) passed to `apply_async` when a backfill task is scheduled.
COHORT_BACKFILL_DEBOUNCE_SECONDS = 300  # 5 minutes
# Expiry (seconds) of the Redis "backfill pending" key used to debounce scheduling.
COHORT_BACKFILL_REDIS_TTL_SECONDS = 300  # matches countdown; task reads fresh state at execution time


# Prometheus metrics for cache hit/miss tracking.
# In this module `cache_type` is "dependencies" or "dependents" and `result` is
# "hit", "miss" or "invalid" (cohort filters failed validation).
COHORT_DEPENDENCY_CACHE_COUNTER = Counter(
    "posthog_cohort_dependency_cache_requests_total",
    "Total number of cohort dependency cache requests",
    labelnames=["cache_type", "result"],
)


def _cohort_dependencies_key(cohort_id: int) -> str:
    """Return the cache key holding the ids that cohort `cohort_id` depends on."""
    return "cohort:dependencies:{}".format(cohort_id)


def _cohort_dependents_key(cohort_id: int) -> str:
    """Return the cache key holding the ids of cohorts that depend on cohort `cohort_id`."""
    return "cohort:dependents:{}".format(cohort_id)


# Set of behavioral (flag-incompatible) cohort ids per team, hidden from the feature-flag
# property picker. Cached because the flag's cohort typeahead hits the cohorts list endpoint
# on every keystroke, and recomputing the dependency graph there means loading every cohort
# for the team into memory. The TTL is only a backstop: the cache is invalidated whenever a
# cohort in the team is saved (other than recalculation-only saves) or deleted (see
# cohort_changed / cohort_deleted), and when the team itself is deleted (see
# clear_team_cohort_dependency_cache). It is keyed on allow_realtime_backfilled because
# that flag toggles which realtime cohorts count as seeds.
BEHAVIORAL_COHORT_IDS_CACHE_TIMEOUT = 60 * 60  # 1 hour


def _behavioral_cohort_ids_key(team_id: int, allow_realtime_backfilled: bool) -> str:
    """Return the cache key for a team's flag-excluded behavioral cohort ids.

    The flag is encoded as 0/1 so each (team, flag) pair gets its own cache entry.
    """
    flag_suffix = int(allow_realtime_backfilled)
    return "cohort:flag_excluded_behavioral_ids:{}:{}".format(team_id, flag_suffix)


def _build_cohort_dependency_graph(all_cohorts: dict[int, Cohort]) -> tuple[dict[int, set[int]], set[int]]:
    """Build a directed graph of cohort dependencies and identify behavioral cohorts.

    Args:
        all_cohorts: cohorts to inspect, keyed by cohort id. Only ids present in this
            mapping can become edge targets.

    Returns:
        (adjacency_list, behavioral_cohort_ids). ``adjacency_list[source]`` is the set of
        cohort ids that ``source`` references in its filters; ``behavioral_cohort_ids`` are
        the cohorts whose filters contain at least one ``behavioral`` node.

    Static cohorts are skipped: they have pre-computed membership and don't re-evaluate
    their filters, so they're always safe to use regardless of filter type.
    """
    graph: dict[int, set[int]] = defaultdict(set)
    behavioral_cohorts: set[int] = set()

    def check_property_values(values: Any, source_id: int) -> None:
        # Filters are user-supplied JSON; silently ignore anything that isn't the expected shape.
        if not isinstance(values, list):
            return

        for value in values:
            if not isinstance(value, dict):
                continue

            value_type = value.get("type")
            if value_type == "behavioral":
                behavioral_cohorts.add(source_id)
            elif value_type == "cohort":
                try:
                    target_id = int(value.get("value", "0"))
                except (TypeError, ValueError):
                    # int() raises TypeError for None/list/dict and ValueError for
                    # non-numeric strings; either way the reference is unusable.
                    continue
                if target_id in all_cohorts:
                    graph[source_id].add(target_id)
            elif value_type in ("AND", "OR") and value.get("values"):
                # Nested group: recurse into its children.
                check_property_values(value["values"], source_id)

    for cohort_id, cohort in all_cohorts.items():
        if cohort.is_static:
            continue
        if cohort.filters:
            properties = cohort.filters.get("properties", {})
            if isinstance(properties, dict):
                check_property_values(properties.get("values", []), cohort_id)

    return graph, behavioral_cohorts


def find_behavioral_cohorts(all_cohorts: dict[int, Cohort], *, allow_realtime_backfilled: bool = False) -> set[int]:
    """Return ids of cohorts that are behavioral or (transitively) reference a behavioral cohort.

    The walk starts from the behavioral "seed" cohorts and follows the dependency graph
    backwards (referenced cohort -> cohorts referencing it), so every cohort that can reach
    a seed through forward edges ends up in the result. Each node is expanded at most once.

    With ``allow_realtime_backfilled=True``, behavioral cohorts whose ``is_flag_compatible``
    is truthy are not used as seeds. They are still returned if they reference another seed.
    """
    forward_edges, behavioral_ids = _build_cohort_dependency_graph(all_cohorts)

    # Start from every behavioral cohort, then drop the flag-compatible ones if allowed.
    seeds = set(behavioral_ids)
    if allow_realtime_backfilled:
        for cohort_id in behavioral_ids:
            candidate = all_cohorts.get(cohort_id)
            if candidate and candidate.is_flag_compatible:
                seeds.discard(cohort_id)

    # Invert the graph: referenced cohort id -> ids of the cohorts that reference it.
    referenced_by: dict[int, list[int]] = {}
    for referrer_id, referenced_ids in forward_edges.items():
        for referenced_id in referenced_ids:
            referenced_by.setdefault(referenced_id, []).append(referrer_id)

    # Iterative traversal over the inverted edges.
    reached = set(seeds)
    pending = list(seeds)
    while pending:
        current = pending.pop()
        for referrer_id in referenced_by.get(current, ()):
            if referrer_id in reached:
                continue
            reached.add(referrer_id)
            pending.append(referrer_id)

    return reached


def _compute_flag_excluded_behavioral_cohort_ids(team_id: int, *, allow_realtime_backfilled: bool) -> set[int]:
    """Compute (uncached) the behavioral cohort ids for a team straight from the database.

    Only non-static cohorts whose filters reference a behavioral node or another cohort can
    be a seed or reach one; the rest are leaves that never get excluded. Filtering them out
    in SQL keeps the in-memory graph — and the JSON we parse — small.
    """
    # The bare-word match can't produce false negatives: a behavioral or cohort node always
    # serializes the literal "behavioral"/"cohort" substring. A false positive (e.g. a
    # person-property value of "cohort") only loads an extra leaf, which the graph walk
    # then ignores.
    mentions_graph_node = Q(_filters_text__icontains="behavioral") | Q(_filters_text__icontains="cohort")

    candidates = Cohort.objects.filter(team_id=team_id, deleted=False, is_static=False)
    candidates = candidates.annotate(_filters_text=Cast("filters", output_field=TextField()))
    candidates = candidates.filter(mentions_graph_node)
    candidates = candidates.only("id", "is_static", "filters", "cohort_type", "last_backfill_person_properties_at")

    cohorts_by_id = {}
    for candidate in candidates:
        cohorts_by_id[candidate.id] = candidate
    return find_behavioral_cohorts(cohorts_by_id, allow_realtime_backfilled=allow_realtime_backfilled)


def get_flag_excluded_behavioral_cohort_ids(team_id: int, *, allow_realtime_backfilled: bool | None) -> set[int]:
    """Behavioral (flag-incompatible) cohort ids for a team, cached across requests.

    ``allow_realtime_backfilled`` may be None (feature_enabled can return None when the flag
    can't be evaluated); it is coerced to a bool so the cache key is stable and the compute
    path sees a real bool.
    """
    realtime_allowed = bool(allow_realtime_backfilled)
    key = _behavioral_cohort_ids_key(team_id, realtime_allowed)

    stored = cache.get(key)
    if stored is None:
        # Genuine miss (an empty list is a valid cached result): compute and store as a list.
        computed = _compute_flag_excluded_behavioral_cohort_ids(team_id, allow_realtime_backfilled=realtime_allowed)
        cache.set(key, list(computed), timeout=BEHAVIORAL_COHORT_IDS_CACHE_TIMEOUT)
        return computed

    return set(stored)


def _invalidate_team_behavioral_cohort_cache(team_id: int) -> None:
    """Drop both cached variants (flag on / flag off) of a team's behavioral cohort ids."""
    stale_keys = [_behavioral_cohort_ids_key(team_id, allow_realtime_backfilled=flag) for flag in (True, False)]
    cache.delete_many(stale_keys)


def extract_cohort_dependencies(cohort: Cohort) -> set[int]:
    """
    Collect the ids of the cohorts referenced by the given cohort's properties.

    Deleted cohorts yield an empty set. Self-references and non-integer cohort values are
    ignored. If reading the properties raises a ValidationError, the error is counted and
    logged, and whatever was collected up to that point is returned.
    """
    found: set[int] = set()
    if cohort.deleted:
        return found

    try:
        for leaf in cohort.properties.flat:
            if leaf.type != "cohort":
                continue
            if isinstance(leaf.value, int) and leaf.value != cohort.id:
                found.add(leaf.value)
    except ValidationError as e:
        COHORT_DEPENDENCY_CACHE_COUNTER.labels(cache_type="dependencies", result="invalid").inc()
        logger.warning("Skipping cohort with invalid filters", cohort_id=cohort.id, error=str(e))

    return found


def get_cohort_dependencies(cohort: Cohort, _warming: bool = False) -> list[int]:
    """
    Return the ids of the cohorts the given cohort depends on, reading through the cache.

    On a miss the dependencies are extracted from the cohort and stored for
    DEPENDENCY_CACHE_TIMEOUT seconds. Hit/miss metrics are skipped when `_warming` is True.
    """
    key = _cohort_dependencies_key(cohort.id)
    record_metrics = not _warming

    # Probe the key up front so a hit can be counted; get_or_set alone doesn't tell us.
    already_cached = cache.has_key(key)
    if already_cached and record_metrics:
        COHORT_DEPENDENCY_CACHE_COUNTER.labels(cache_type="dependencies", result="hit").inc()

    def _compute_on_miss():
        if record_metrics:
            COHORT_DEPENDENCY_CACHE_COUNTER.labels(cache_type="dependencies", result="miss").inc()
        return list(extract_cohort_dependencies(cohort))

    dependencies = cache.get_or_set(key, _compute_on_miss, timeout=DEPENDENCY_CACHE_TIMEOUT)

    if dependencies is None:
        logger.error("Cohort dependencies cache returned None", cohort_id=cohort.id)
    return dependencies or []


def get_cohort_dependents(cohort: Cohort | int) -> list[int]:
    """
    Return the ids of the cohorts that depend on the given cohort.

    Accepts either a Cohort object or a cohort id. On a cache miss the whole team's
    dependency cache is rebuilt; when only an id was given, the team id is first looked up
    in the database (non-deleted cohorts only), and an empty list is returned if that
    lookup finds nothing or fails.
    """
    cohort_id = cohort if not isinstance(cohort, Cohort) else cohort.id
    key = _cohort_dependents_key(cohort_id)

    # Probe the key up front so a hit can be counted.
    if cache.has_key(key):
        COHORT_DEPENDENCY_CACHE_COUNTER.labels(cache_type="dependents", result="hit").inc()

    def _rebuild_from_team() -> list[int]:
        COHORT_DEPENDENCY_CACHE_COUNTER.labels(cache_type="dependents", result="miss").inc()

        if not isinstance(cohort, int):
            team_id = cohort.team_id
        else:
            # Only an id was supplied: resolve its team from the database.
            try:
                team_id = Cohort.objects.filter(pk=cohort_id, deleted=False).values_list("team_id", flat=True).first()
                if team_id is None:
                    logger.warning("Cohort not found when computing dependents", cohort_id=cohort_id)
                    return []
            except Exception as e:
                logger.exception("Failed to fetch team_id for cohort", cohort_id=cohort_id, error=str(e))
                return []

        # Warming writes the dependents entries for the team; read ours back afterwards.
        warm_team_cohort_dependency_cache(team_id)
        return cache.get(key, [])

    dependents = cache.get_or_set(key, _rebuild_from_team, timeout=DEPENDENCY_CACHE_TIMEOUT)
    if dependents is None:
        logger.error("Cohort dependents cache returned None", cohort_id=cohort_id)
    return dependents or []


def warm_team_cohort_dependency_cache(team_id: int, batch_size: int = 1000):
    """
    Populate the dependencies and dependents cache entries for every non-deleted cohort of a team.

    Cohorts are streamed from the database in chunks of `batch_size`. The dependents
    entries are written in a single `set_many` call at the end.
    """
    reverse_index: dict[str, list[int]] = {}
    team_cohorts = Cohort.objects.filter(team_id=team_id, deleted=False)

    for member in team_cohorts.iterator(chunk_size=batch_size):
        # Every cohort gets a dependents entry, even if nothing depends on it.
        reverse_index.setdefault(_cohort_dependents_key(member.id), [])

        # Reading through the cache rebuilds any dependencies entry that was invalidated.
        depends_on = get_cohort_dependencies(member, _warming=True)
        # Dependency keys aren't fully invalidated; refresh their TTL so they don't expire.
        cache.touch(_cohort_dependencies_key(member.id), timeout=DEPENDENCY_CACHE_TIMEOUT)

        # Record this cohort as a dependent of each cohort it references.
        for upstream_id in depends_on:
            reverse_index.setdefault(_cohort_dependents_key(upstream_id), []).append(member.id)

    cache.set_many(reverse_index, timeout=DEPENDENCY_CACHE_TIMEOUT)


def _on_cohort_changed(cohort: Cohort, always_invalidate: bool = False):
    """
    Invalidate and rebuild the dependency caches affected by a change to `cohort`.

    Nothing happens when the cohort is not deleted, `always_invalidate` is False and the
    cached dependencies already match the cohort's current ones. Otherwise the cohort's own
    entries and the dependents entries of its previously cached dependencies are deleted,
    and the team's cache is warmed again.
    """
    dependencies_key = _cohort_dependencies_key(cohort.id)
    current = extract_cohort_dependencies(cohort)
    cached = cache.get(dependencies_key)

    # A missing cache entry counts as "changed".
    unchanged = cached is not None and set(cached) == current
    if unchanged and not always_invalidate and not cohort.deleted:
        return

    cache.delete(dependencies_key)
    cache.delete(_cohort_dependents_key(cohort.id))

    # Cohorts this one used to reference may no longer list it as a dependent.
    for upstream_id in cached or ():
        cache.delete(_cohort_dependents_key(upstream_id))

    warm_team_cohort_dependency_cache(cohort.team_id)


def _has_person_property_filters(cohort: Cohort) -> bool:
    """
    Tell whether the cohort's filters contain any person property filter.

    True exactly when `_extract_person_property_filters` returns a non-empty string.
    Used to decide whether a backfill should be triggered.
    """
    fingerprint = _extract_person_property_filters(cohort)
    return bool(fingerprint)


def _person_property_filters_changed(cohort: Cohort) -> bool:
    """
    Tell whether the cohort's person property filters differ from the pre_save snapshot.

    Compares the current filter hash with `_previous_person_property_filters`, which
    `cohort_pre_save` stores on the instance. Errs on the side of True: unsaved cohorts, a
    missing (None) snapshot and any unexpected error all count as "changed".
    """
    try:
        if cohort.pk:
            snapshot = getattr(cohort, "_previous_person_property_filters", None)
            if snapshot is not None:
                # Changed exactly when the current hash differs from the snapshot.
                return _extract_person_property_filters(cohort) != snapshot

        # New cohort, or no previous state available: assume changed to be safe.
        return True

    except Exception as e:
        logger.warning(
            "error_checking_person_property_filter_changes",
            cohort_id=cohort.pk,
            error=str(e),
        )
        # Unable to tell either way; treat as changed.
        return True


def _extract_person_property_filters(cohort: Cohort) -> str:
    """
    Fingerprint the person property filters of a cohort.

    Returns the SHA-256 hex digest of a normalized filter tree that keeps only AND/OR
    groups and person conditions (identified by their `conditionHash`), so both the
    conditions and their logical structure are captured. Returns "" when the cohort has no
    filters, no properties, or no person condition with a `conditionHash`.
    """
    import hashlib
    import json

    def _normalize(node) -> dict | None:
        """Return the normalized form of `node`, or None if it holds no person condition."""
        if not isinstance(node, dict):
            return None

        kind = node.get("type")

        # Leaf: only person conditions carrying a conditionHash are kept.
        if kind == "person":
            condition_hash = node.get("conditionHash")
            if condition_hash is None:
                return None
            return {"type": "person", "conditionHash": condition_hash}

        if kind not in ("AND", "OR"):
            return None

        # Group: normalize the children and drop the ones that normalized to nothing.
        kept = [child for child in map(_normalize, node.get("values", [])) if child is not None]
        if not kept:
            return None

        # Child order is irrelevant for AND/OR, so sort by JSON form to get a stable result.
        kept.sort(key=lambda item: json.dumps(item, sort_keys=True))
        return {"type": kind, "children": kept}

    filters = cohort.filters
    if not filters:
        return ""

    root = filters.get("properties")
    if not root:
        return ""

    normalized = _normalize(root)
    if not normalized:
        return ""

    # Compact, key-sorted JSON keeps the digest stable.
    canonical = json.dumps(normalized, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(canonical.encode()).hexdigest()


def _trigger_cohort_backfill(cohort: Cohort) -> None:
    """
    Schedule a debounced backfill task for a realtime cohort with person properties.

    A Redis key acts as a "backfill pending" marker: only the caller that creates it
    schedules the task (delayed by COHORT_BACKFILL_DEBOUNCE_SECONDS); saves arriving while
    the key exists are skipped. Every failure is logged and swallowed here.
    """
    try:
        from posthog.tasks.calculate_cohort import trigger_cohort_backfill_task

        redis_conn = get_redis_client()
        pending_key = f"cohort_backfill_pending:{cohort.pk}"

        # Atomic set-if-not-exists with TTL: truthy only for the first caller in the window.
        acquired = redis_conn.set(pending_key, 1, nx=True, ex=COHORT_BACKFILL_REDIS_TTL_SECONDS)
        if not acquired:
            logger.info(
                "cohort_backfill_already_pending",
                cohort_id=cohort.pk,
                team_id=cohort.team_id,
            )
            return

        logger.info(
            "triggering_cohort_backfill_on_conditions_change",
            cohort_id=cohort.pk,
            team_id=cohort.team_id,
            cohort_type=cohort.cohort_type,
            debounce_seconds=COHORT_BACKFILL_DEBOUNCE_SECONDS,
        )
        try:
            trigger_cohort_backfill_task.apply_async(
                args=[cohort.team_id, cohort.pk],
                countdown=COHORT_BACKFILL_DEBOUNCE_SECONDS,
            )
        except Exception:
            # Scheduling failed: drop the marker so the next save can retry.
            redis_conn.delete(pending_key)
            raise

    except Exception as e:
        logger.exception(
            "failed_to_trigger_cohort_backfill",
            cohort_id=cohort.pk,
            team_id=cohort.team_id,
            error=str(e),
        )


@receiver(pre_save, sender=Cohort)
def cohort_pre_save(sender, instance, **kwargs):
    """
    Snapshot the stored person property filter hash on the instance before it is saved.

    The snapshot goes into `instance._previous_person_property_filters` for comparison in
    post_save. It is "" when no comparison is needed (unsaved or non-realtime cohort, or
    `filters` is not among `update_fields`) and None when the previous state could not be
    read.
    """
    try:
        # Unsaved and non-realtime cohorts are skipped to avoid an extra DB query.
        skip_snapshot = not instance.pk or instance.cohort_type != CohortType.REALTIME
        if not skip_snapshot:
            # A partial save that doesn't touch `filters` can't change them either.
            fields_being_saved = kwargs.get("update_fields")
            skip_snapshot = fields_being_saved is not None and "filters" not in fields_being_saved

        if skip_snapshot:
            instance._previous_person_property_filters = ""
            return

        # Hash the filters of the row as it currently exists in the database.
        stored_cohort = Cohort.objects.get(pk=instance.pk)
        instance._previous_person_property_filters = _extract_person_property_filters(stored_cohort)
    except Cohort.DoesNotExist:
        # No stored row (should not happen): treat like a new cohort.
        instance._previous_person_property_filters = ""
    except Exception as e:
        logger.warning(
            "error_capturing_previous_person_property_filters",
            cohort_id=instance.pk,
            error=str(e),
        )
        # Previous state unknown; None makes the post_save comparison assume "changed".
        instance._previous_person_property_filters = None


@receiver(post_save, sender=Cohort)
def cohort_changed(sender, instance, **kwargs):
    """
    Refresh the cohort caches once the surrounding transaction commits.

    Recalculation-only saves are ignored. Otherwise two commit hooks are registered: one
    rebuilds the dependency caches, the other drops the team's behavioral-cohort-id cache.
    """
    if not is_cohort_recalculation_only_save(kwargs):

        def _refresh_dependency_caches() -> None:
            _on_cohort_changed(instance)

        def _drop_behavioral_ids_cache() -> None:
            _invalidate_team_behavioral_cohort_cache(instance.team_id)

        transaction.on_commit(_refresh_dependency_caches)
        transaction.on_commit(_drop_behavioral_ids_cache)


@receiver(post_save, sender=Cohort)
def cohort_conditions_changed_backfill(sender, instance, **kwargs):
    """
    Schedule a backfill when the person property conditions of a realtime cohort change.

    The backfill is only scheduled (on transaction commit) when all of these hold: the save
    is not recalculation-only, `filters` may have been written, the cohort is realtime,
    non-static and not deleted, its person property filters are new or changed, and the
    "cohort-backfill-on-change" feature flag is enabled for the team.
    """
    if is_cohort_recalculation_only_save(kwargs):
        return

    # Mirrors cohort_pre_save: a partial save without `filters` cannot change them.
    saved_fields = kwargs.get("update_fields")
    if saved_fields is not None and "filters" not in saved_fields:
        return

    # Only live, dynamic, realtime cohorts are backfilled.
    if instance.cohort_type != CohortType.REALTIME or instance.is_static or instance.deleted:
        return

    if kwargs.get("created", False):
        # New cohort: relevant only if it has person property filters at all.
        needs_backfill = _has_person_property_filters(instance)
    else:
        # Existing cohort: relevant only if those filters actually changed.
        needs_backfill = _person_property_filters_changed(instance)
    if not needs_backfill:
        return

    flag_enabled = posthoganalytics.feature_enabled(
        "cohort-backfill-on-change",
        str(instance.team_id),
        groups={"team": str(instance.team_id)},
        send_feature_flag_events=False,
    )
    if not flag_enabled:
        return

    def _schedule_backfill() -> None:
        _trigger_cohort_backfill(instance)

    # Defer until commit so the backfill runs after the current transaction.
    transaction.on_commit(_schedule_backfill)


@receiver(post_delete, sender=Cohort)
def cohort_deleted(sender, instance, **kwargs):
    """
    Invalidate and rebuild the cohort caches once the deletion is committed.

    Registers two commit hooks: one forces a dependency-cache invalidation and rebuild for
    the cohort's team, the other drops the team's behavioral-cohort-id cache.
    """
    # NOTE(review): `instance.pk` may already have been reset to None when these hooks run
    # if the delete happened inside an outer transaction — confirm that
    # `_on_cohort_changed` still targets the intended cache keys in that case.

    def _rebuild_dependency_caches() -> None:
        _on_cohort_changed(instance, always_invalidate=True)

    def _drop_behavioral_ids_cache() -> None:
        _invalidate_team_behavioral_cohort_cache(instance.team_id)

    transaction.on_commit(_rebuild_dependency_caches)
    transaction.on_commit(_drop_behavioral_ids_cache)


@receiver(post_delete, sender=Team)
def clear_team_cohort_dependency_cache(sender, instance: Team, **kwargs):
    """
    Clear cohort dependency caches for all cohorts belonging to the deleted team.

    The cleanup runs in a commit hook. It removes the dependencies/dependents entries of
    every non-deleted cohort still found for the team, plus the team's
    behavioral-cohort-id cache entries.
    """
    # Capture the id now rather than reading `instance.pk` inside the hook: Django resets
    # the instance's primary key to None once the delete completes, which can happen
    # before the hook runs (e.g. when the delete is nested in an outer transaction).
    team_id = instance.pk

    def clear_cache() -> None:
        # NOTE(review): if the team's cohorts are removed together with the team (cascade),
        # this query may find nothing by the time it runs — confirm against the Cohort model.
        cohort_ids = Cohort.objects.filter(team_id=team_id, deleted=False).values_list("id", flat=True)

        stale_keys: list[str] = []
        for cohort_id in cohort_ids:
            stale_keys.append(_cohort_dependencies_key(cohort_id))
            stale_keys.append(_cohort_dependents_key(cohort_id))

        # One batched delete instead of two cache round trips per cohort.
        if stale_keys:
            cache.delete_many(stale_keys)

        _invalidate_team_behavioral_cohort_cache(team_id)

    transaction.on_commit(clear_cache)