Skip to content

Commit e573ab8

Browse files
Copilot and inducer authored
Filter negative durations at DB level; document two-fetch stddev approach; fix stddev avg consistency
Agent-Logs-Url: https://github.com/inducer/relate/sessions/be6291a6-07c2-4d53-b712-44e571e24cae
Co-authored-by: inducer <352067+inducer@users.noreply.github.com>
1 parent 57c824f commit e573ab8

1 file changed

Lines changed: 17 additions & 14 deletions

File tree

course/analytics.py

Lines changed: 17 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
import operator
2828
from collections import defaultdict
2929
from dataclasses import dataclass
30+
from datetime import timedelta
3031
from typing import TYPE_CHECKING, Any, ClassVar, final
3132

3233
from django import http
@@ -621,10 +622,15 @@ def make_page_timing_stats_list(
621622
PPerm.included_in_grade_statistics)
622623
)
623624

624-
# Annotate each submitted-answer visit with its elapsed duration and push
625-
# Count/Avg/Min/Max aggregation to the database. StdDev is not universally
625+
# Annotate each submitted-answer visit with its elapsed duration.
626+
# Negative durations (clock-skew or data anomalies where the answer visit
627+
# was recorded before the page view) are excluded at the database level so
628+
# that the aggregated values and the per-row durations fetched for stddev
629+
# are computed from the same filtered set.
630+
# Count/Avg/Min/Max are pushed to the database. StdDev is not universally
626631
# supported across database backends (notably absent from SQLite), so sample
627-
# standard deviation is computed in Python from the per-page duration lists.
632+
# standard deviation is computed in Python from a second fetch of the
633+
# per-page duration values. This is a known two-query approach for stddev.
628634
annotated_qs = (visits_qs
629635
.annotate(preceding_visit_time=preceding_time_sq)
630636
.filter(preceding_visit_time__isnull=False)
@@ -634,6 +640,7 @@ def make_page_timing_stats_list(
634640
output_field=DurationField(),
635641
)
636642
)
643+
.filter(duration__gte=timedelta(0))
637644
)
638645

639646
page_agg = (annotated_qs
@@ -650,12 +657,7 @@ def make_page_timing_stats_list(
650657
page_durations: dict[tuple[str, str], list[float]] = defaultdict(list)
651658
for row in (annotated_qs
652659
.values_list("page_data__group_id", "page_data__page_id", "duration")):
653-
key = (row[0], row[1])
654-
minutes = row[2].total_seconds() / 60
655-
# Skip negative durations; these can arise from clock-skew or data
656-
# anomalies where the answer visit was recorded before the page view.
657-
if minutes >= 0:
658-
page_durations[key].append(minutes)
660+
page_durations[row[0], row[1]].append(row[2].total_seconds() / 60)
659661

660662
# Build result list.
661663
result: list[PageTimingStats] = []
@@ -677,12 +679,13 @@ def make_page_timing_stats_list(
677679
)
678680

679681
# Sample stddev (requires at least two observations).
682+
# avg_time and page_durations are derived from the same filtered set, so
683+
# they are consistent.
680684
stddev_time: float | None = None
681-
if count >= 2 and avg_time is not None:
682-
times = page_durations.get((group_id, page_id), [])
683-
if len(times) >= 2:
684-
variance = sum((t - avg_time) ** 2 for t in times) / (len(times) - 1)
685-
stddev_time = variance ** 0.5
685+
times = page_durations.get((group_id, page_id), [])
686+
if len(times) >= 2 and avg_time is not None:
687+
variance = sum((t - avg_time) ** 2 for t in times) / (len(times) - 1)
688+
stddev_time = variance ** 0.5
686689

687690
result.append(PageTimingStats(
688691
group_id=group_id,

0 commit comments

Comments
 (0)