@@ -145,33 +145,27 @@ def coverage_measurements(
     return aggregate_measurements(queryset).order_by("timestamp_bin")
 
 
-def trigger_backfill(dataset: Dataset):
+def trigger_backfill(datasets: list[Dataset]):
     """
     Triggers a backfill for the full timespan of the dataset's repo's commits.
     """
-    oldest_commit = (
-        Commit.objects.filter(repository_id=dataset.repository_id)
-        .order_by("timestamp")
-        .first()
+    repo_ids = {d.repository_id for d in datasets}
+    timeranges = (
+        Commit.objects.filter(repository_id__in=repo_ids)
+        .values_list("repository_id")
+        .annotate(start_date=Min("timestamp"), end_date=Max("timestamp"))
     )
 
-    newest_commit = (
-        Commit.objects.filter(repository_id=dataset.repository_id)
-        .order_by("-timestamp")
-        .first()
-    )
-
-    if oldest_commit and newest_commit:
-        # dates to span the entire range of commits
-        start_date = oldest_commit.timestamp.date()
-        start_date = datetime.fromordinal(start_date.toordinal())
-        end_date = newest_commit.timestamp.date() + timedelta(days=1)
-        end_date = datetime.fromordinal(end_date.toordinal())
+    timerange_by_repo = {
+        repo_id: (start_date, end_date) for repo_id, start_date, end_date in timeranges
+    }
 
+    for dataset in datasets:
+        if dataset.repository_id not in timerange_by_repo:
+            continue  # there are no commits, and thus nothing to backfill
+        start_date, end_date = timerange_by_repo[dataset.repository_id]
         TaskService().backfill_dataset(
-            dataset,
-            start_date=start_date,
-            end_date=end_date,
+            dataset, start_date=start_date, end_date=end_date
         )
 
 
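The rewrite above batches the backfill span lookup: instead of two ordered queries per dataset (oldest commit, newest commit), one grouped aggregate returns a (repository_id, start_date, end_date) row for every repo at once. A minimal sketch of that grouping in plain Python, with hypothetical commit rows standing in for what the annotated queryset yields:

    from collections import defaultdict
    from datetime import datetime

    # Hypothetical (repository_id, timestamp) rows, standing in for what
    # .values_list("repository_id").annotate(Min("timestamp"), Max("timestamp"))
    # produces: one (repo_id, start_date, end_date) tuple per repo.
    commits = [
        (1, datetime(2022, 1, 3)),
        (1, datetime(2022, 6, 9)),
        (2, datetime(2021, 11, 20)),
    ]

    # group timestamps by repo, then keep the min and max per group
    by_repo = defaultdict(list)
    for repo_id, ts in commits:
        by_repo[repo_id].append(ts)

    timerange_by_repo = {
        repo_id: (min(stamps), max(stamps)) for repo_id, stamps in by_repo.items()
    }

    assert timerange_by_repo[1] == (datetime(2022, 1, 3), datetime(2022, 6, 9))
    assert 3 not in timerange_by_repo  # a repo with no commits never appears,
                                       # which is what the `continue` guards
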
@@ -340,42 +334,41 @@ def repository_coverage_measurements_with_fallback(
     If those are not available then we trigger a backfill and return computed results
     directly from the primary database (much slower to query).
     """
-    dataset = None
     if settings.TIMESERIES_ENABLED:
         dataset = Dataset.objects.filter(
             name=MeasurementName.COVERAGE.value,
             repository_id=repository.pk,
         ).first()
 
-    if settings.TIMESERIES_ENABLED and dataset and dataset.is_backfilled():
-        # timeseries data is ready
-        return coverage_measurements(
-            interval,
-            start_date=start_date,
-            end_date=end_date,
-            owner_id=repository.author_id,
-            repo_id=repository.pk,
-            measurable_id=str(repository.pk),
-            branch=branch or repository.branch,
-        )
-    else:
-        if settings.TIMESERIES_ENABLED and not dataset:
+        if dataset and dataset.is_backfilled():
+            # timeseries data is ready
+            return coverage_measurements(
+                interval,
+                start_date=start_date,
+                end_date=end_date,
+                owner_id=repository.author_id,
+                repo_id=repository.pk,
+                measurable_id=str(repository.pk),
+                branch=branch or repository.branch,
+            )
+
+        if not dataset:
             # we need to backfill
             dataset, created = Dataset.objects.get_or_create(
                 name=MeasurementName.COVERAGE.value,
                 repository_id=repository.pk,
             )
             if created:
-                trigger_backfill(dataset)
-
-        # we're still backfilling or timeseries is disabled
-        return coverage_fallback_query(
-            interval,
-            start_date=start_date,
-            end_date=end_date,
-            repository_id=repository.pk,
-            branch=branch or repository.branch,
-        )
+                trigger_backfill([dataset])
+
+    # we're still backfilling or timeseries is disabled
+    return coverage_fallback_query(
+        interval,
+        start_date=start_date,
+        end_date=end_date,
+        repository_id=repository.pk,
+        branch=branch or repository.branch,
+    )
 
 
 @sentry_sdk.trace
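The single-repo path now nests everything under one `settings.TIMESERIES_ENABLED` check and relies on the `created` flag from `get_or_create`, so only the request that actually creates the Dataset row enqueues a backfill. A toy, runnable sketch of that guard (an in-memory dict stands in for the Dataset table; all names here are hypothetical):

    _datasets: dict[int, str] = {}  # stand-in for the Dataset table

    def get_or_create(repo_id: int) -> tuple[str, bool]:
        """Mimics Dataset.objects.get_or_create: returns (row, created)."""
        if repo_id in _datasets:
            return _datasets[repo_id], False
        _datasets[repo_id] = f"dataset-{repo_id}"
        return _datasets[repo_id], True

    triggered: list[str] = []

    def handle_request(repo_id: int) -> None:
        dataset, created = get_or_create(repo_id)
        if created:
            triggered.append(dataset)  # trigger_backfill([dataset]) in the real code

    handle_request(42)
    handle_request(42)  # the second request finds the row and enqueues nothing
    assert triggered == ["dataset-42"]
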
@@ -391,48 +384,44 @@ def owner_coverage_measurements_with_fallback(
     If those are not available then we trigger a backfill and return computed results
     directly from the primary database (much slower to query).
     """
-    datasets = []
+    # we can't join across databases so we need to load all this into memory.
+    # select just the needed columns to keep this manageable
+    repos = Repository.objects.filter(repoid__in=repo_ids).only("repoid", "branch")
+
     if settings.TIMESERIES_ENABLED:
         datasets = Dataset.objects.filter(
             name=MeasurementName.COVERAGE.value,
             repository_id__in=repo_ids,
         )
-
-    all_backfilled = len(datasets) == len(repo_ids) and all(
-        dataset.is_backfilled() for dataset in datasets
-    )
-
-    # we can't join across databases so we need to load all this into memory.
-    # select just the needed columns to keep this manageable
-    repos = Repository.objects.filter(repoid__in=repo_ids).only("repoid", "branch")
-
-    if settings.TIMESERIES_ENABLED and all_backfilled:
-        # timeseries data is ready
-        return coverage_measurements(
-            interval,
-            start_date=start_date,
-            end_date=end_date,
-            owner_id=owner.pk,
-            repos=repos,
+        all_backfilled = len(datasets) == len(repo_ids) and all(
+            dataset.is_backfilled() for dataset in datasets
         )
-    else:
-        if settings.TIMESERIES_ENABLED:
-            # we need to backfill some datasets
-            dataset_repo_ids = {dataset.repository_id for dataset in datasets}
-            missing_dataset_repo_ids = set(repo_ids) - dataset_repo_ids
-            created_datasets = Dataset.objects.bulk_create(
-                [
-                    Dataset(name=MeasurementName.COVERAGE.value, repository_id=repo_id)
-                    for repo_id in missing_dataset_repo_ids
-                ]
+
+        if all_backfilled:
+            # timeseries data is ready
+            return coverage_measurements(
+                interval,
+                start_date=start_date,
+                end_date=end_date,
+                owner_id=owner.pk,
+                repos=repos,
             )
-            for dataset in created_datasets:
-                trigger_backfill(dataset)
-
-        # we're still backfilling or timeseries is disabled
-        return coverage_fallback_query(
-            interval,
-            start_date=start_date,
-            end_date=end_date,
-            repos=repos,
+
+        # we need to backfill some datasets
+        dataset_repo_ids = {dataset.repository_id for dataset in datasets}
+        missing_dataset_repo_ids = set(repo_ids) - dataset_repo_ids
+        created_datasets = Dataset.objects.bulk_create(
+            [
+                Dataset(name=MeasurementName.COVERAGE.value, repository_id=repo_id)
+                for repo_id in missing_dataset_repo_ids
+            ]
         )
+        trigger_backfill(created_datasets)
+
+    # we're still backfilling or timeseries is disabled
+    return coverage_fallback_query(
+        interval,
+        start_date=start_date,
+        end_date=end_date,
+        repos=repos,
+    )
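In the owner-level path the batching shows up end to end: a set difference finds the repos that have no coverage Dataset yet, `bulk_create` inserts them in one query, and the whole list goes to `trigger_backfill` in a single call rather than once per dataset. A runnable sketch of that flow, with plain values standing in for model instances (all names hypothetical):

    repo_ids = [101, 102, 103]
    existing_dataset_repo_ids = {101}  # repos that already have a coverage Dataset

    # set difference -> the repos that still need a Dataset row
    missing = set(repo_ids) - existing_dataset_repo_ids

    # one bulk insert instead of one INSERT per repo
    created_datasets = [f"dataset-{repo_id}" for repo_id in sorted(missing)]

    def trigger_backfill(datasets: list[str]) -> None:
        # the real version now issues ONE Min/Max commit query for all of these
        print(f"backfilling {len(datasets)} datasets in one batch")

    trigger_backfill(created_datasets)  # called once, not len(missing) times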