Commit 0dd448f

add cleanup of CompareCommit and BA CommitReports
1 parent f0e2bce commit 0dd448f

File tree

1 file changed: +85 -71 lines changed


services/cleanup/models.py

+85-71
@@ -5,9 +5,15 @@
 from functools import partial
 
 from django.db.models import Model
-from django.db.models.query import Q, QuerySet
+from django.db.models.query import QuerySet
+from shared.bundle_analysis import StoragePaths
 from shared.django_apps.core.models import Commit, Pull
-from shared.django_apps.reports.models import CommitReport, ReportDetails, ReportSession
+from shared.django_apps.reports.models import (
+    CommitReport,
+    CompareCommit,
+    ReportDetails,
+    ReportSession,
+)
 
 from services.archive import ArchiveService, MinioEndpoints
 from services.cleanup.utils import CleanupContext
@@ -16,58 +22,75 @@
 DELETE_FILES_BATCHSIZE = 50
 
 
-# This has all the `Repository` fields needed by `get_archive_hash`
-@dataclasses.dataclass
-class FakeRepository:
-    repoid: int
-    service: str
-    service_id: str
-
-
 def cleanup_files_batched(context: CleanupContext, paths: list[str]) -> int:
     cleaned_files = 0
 
     # TODO: maybe reuse the executor across calls?
     with ThreadPoolExecutor() as e:
         for batched_paths in itertools.batched(paths, DELETE_FILES_BATCHSIZE):
-            e.submit(context.storage.delete_files(context.bucket, batched_paths))
+            e.submit(context.storage.delete_files, context.bucket, batched_paths)
 
             cleaned_files += len(batched_paths)
 
     return cleaned_files
 
 
-def cleanup_archivefield(
-    field_name: str, context: CleanupContext, query: QuerySet
+def cleanup_with_storage_field(
+    path_field: str,
+    context: CleanupContext,
+    query: QuerySet,
 ) -> tuple[int, int]:
-    model_field_name = f"_{field_name}_storage_path"
+    cleaned_files = 0
 
-    # delete `None` `field_name`s right away
-    cleaned_models = query.filter(**{f"{model_field_name}__isnull": True})._raw_delete(
+    # delete `None` `path_field`s right away
+    cleaned_models = query.filter(**{f"{path_field}__isnull": True})._raw_delete(
         query.db
     )
 
-    # and then delete all non-`None` `field_name`s:
-    storage_query = query.filter(**{f"{model_field_name}__isnull": False})
-    res = cleanup_with_storage_field(context, model_field_name, storage_query)
+    # delete all those files from storage, using chunks based on the `id` column
+    storage_query = query.filter(**{f"{path_field}__isnull": False}).order_by("id")
+
+    while True:
+        storage_paths = storage_query.values_list(path_field, flat=True)[
+            :MANUAL_QUERY_CHUNKSIZE
+        ]
+        if len(storage_paths) == 0:
+            break
+
+        cleaned_files += cleanup_files_batched(context, storage_paths)
+        cleaned_models += query.filter(
+            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
+        )._raw_delete(query.db)
 
-    cleaned_models += res[0]
-    cleaned_files = res[1]
     return (cleaned_models, cleaned_files)
 
 
+def cleanup_archivefield(
+    field_name: str, context: CleanupContext, query: QuerySet
+) -> tuple[int, int]:
+    model_field_name = f"_{field_name}_storage_path"
+
+    return cleanup_with_storage_field(model_field_name, context, query)
+
+
+# This has all the `Repository` fields needed by `get_archive_hash`
+@dataclasses.dataclass
+class FakeRepository:
+    repoid: int
+    service: str
+    service_id: str
+
+
 def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
-    coverage_reports = (
-        query.filter(Q(report_type=None) | Q(report_type="coverage"))
-        .values_list(
-            "code",
-            "commit__commitid",
-            "repository__repoid",
-            "repository__owner__service",
-            "repository__service_id",
-        )
-        .order_by("id")
-    )
+    coverage_reports = query.values_list(
+        "report_type",
+        "code",
+        "external_id",
+        "commit__commitid",
+        "repository__repoid",
+        "repository__owner__service",
+        "repository__service_id",
+    ).order_by("id")
 
     cleaned_models = 0
     cleaned_files = 0
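
Side note on the `e.submit` fix in `cleanup_files_batched` above: `executor.submit(fn(args))` calls `fn` eagerly on the submitting thread and hands its return value to the executor, while `executor.submit(fn, args)` schedules the call itself on a worker thread. A minimal, standalone sketch (using a stand-in `delete_files`, not the real storage client):

from concurrent.futures import ThreadPoolExecutor


def delete_files(bucket: str, paths: list[str]) -> int:
    # stand-in for `context.storage.delete_files`
    print(f"deleting {len(paths)} files from {bucket}")
    return len(paths)


with ThreadPoolExecutor() as e:
    # fixed form: the call runs on a worker thread, and the Future resolves
    # to the function's return value
    future = e.submit(delete_files, "archive", ["a.txt", "b.txt"])
    assert future.result() == 2

    # buggy form (as before this commit): `delete_files(...)` has already run
    # here on the calling thread, and `submit` would then be handed the
    # integer 2, which is not callable, so the resulting Future fails
    # e.submit(delete_files("archive", ["a.txt", "b.txt"]))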
@@ -80,7 +103,9 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
 
         storage_paths: list[str] = []
         for (
+            report_type,
             report_code,
+            external_id,
             commit_sha,
             repoid,
             repo_service,
@@ -93,14 +118,30 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
                 repo_hashes[repoid] = ArchiveService.get_archive_hash(fake_repo)
             repo_hash = repo_hashes[repoid]
 
-            chunks_file_name = report_code if report_code is not None else "chunks"
-            path = MinioEndpoints.chunks.get_path(
-                version="v4",
-                repo_hash=repo_hash,
-                commitid=commit_sha,
-                chunks_file_name=chunks_file_name,
-            )
-            storage_paths.append(path)
+            # depending on the `report_type`, we have:
+            # - a `chunks` file for coverage
+            # - a `bundle_report.sqlite` for BA
+            match report_type:
+                case "bundle_analysis":
+                    path = StoragePaths.bundle_report.path(
+                        repo_key=repo_hash, report_key=external_id
+                    )
+                    # TODO: bundle analysis lives in a different bucket I believe?
+                    storage_paths.append(path)
+                case "test_results":
+                    # TODO:
+                    pass
+                case _:  # coverage
+                    chunks_file_name = (
+                        report_code if report_code is not None else "chunks"
+                    )
+                    path = MinioEndpoints.chunks.get_path(
+                        version="v4",
+                        repo_hash=repo_hash,
+                        commitid=commit_sha,
+                        chunks_file_name=chunks_file_name,
+                    )
+                    storage_paths.append(path)
 
         cleaned_files += cleanup_files_batched(context, storage_paths)
         cleaned_models += query.filter(
@@ -110,34 +151,7 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
     return (cleaned_models, cleaned_files)
 
 
-def cleanup_upload(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
-    return cleanup_with_storage_field(context, "storage_path", query)
-
-
-def cleanup_with_storage_field(
-    context: CleanupContext,
-    path_field: str,
-    query: QuerySet,
-) -> tuple[int, int]:
-    cleaned_models = 0
-    cleaned_files = 0
-
-    # delete all those files from storage, using chunks based on the `id` column
-    storage_query = query.order_by("id")
-
-    while True:
-        storage_paths = storage_query.values_list(path_field, flat=True)[
-            :MANUAL_QUERY_CHUNKSIZE
-        ]
-        if len(storage_paths) == 0:
-            break
-
-        cleaned_files += cleanup_files_batched(context, storage_paths)
-        cleaned_models += query.filter(
-            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
-        )._raw_delete(query.db)
-
-    return (cleaned_models, cleaned_files)
+# "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"
 
 
 # All the models that need custom python code for deletions so a bulk `DELETE` query does not work.
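
The `# "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"` comment added in this hunk spells out the bundle-analysis storage layout that `StoragePaths.bundle_report` resolves to in `cleanup_commitreport` above. A hand-rolled, illustrative sketch of that path construction (not how shared's `StoragePaths` is actually implemented):

# Illustrative only: formats the template quoted in the comment above.
BUNDLE_REPORT_TEMPLATE = "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"


def bundle_report_path(repo_key: str, report_key: str) -> str:
    # `repo_key` is the repository's archive hash, `report_key` the
    # CommitReport's `external_id` (as passed in `cleanup_commitreport`)
    return BUNDLE_REPORT_TEMPLATE.format(repo_key=repo_key, report_key=report_key)


assert (
    bundle_report_path("abc123", "d3adb33f")
    == "v1/repos/abc123/d3adb33f/bundle_report.sqlite"
)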
@@ -148,9 +162,9 @@ def cleanup_with_storage_field(
     Pull: partial(cleanup_archivefield, "flare"),
     ReportDetails: partial(cleanup_archivefield, "files_array"),
     CommitReport: cleanup_commitreport,
-    ReportSession: cleanup_upload,
+    ReportSession: partial(cleanup_with_storage_field, "storage_path"),
+    CompareCommit: partial(cleanup_with_storage_field, "report_storage_path"),
     # TODO: figure out any other models which have files in storage that are not `ArchiveField`
     # TODO: TA is also storing files in GCS
-    # TODO: BA is also storing files in GCS
-    # TODO: There is also `CompareCommit.report_storage_path`, but that does not seem to be implemented as Django model?
+    # TODO: profiling, label analysis and static analysis also needs porting to django
 }
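
For context, a hypothetical sketch of how a caller might dispatch on this mapping (the dict's actual name and the calling code live outside this diff): models listed here get their custom handler, everything else can fall back to a plain bulk delete.

def cleanup_queryset(manual_cleanup: dict, context, model, query) -> tuple[int, int]:
    # `manual_cleanup` is assumed to be the mapping defined above:
    # model class -> handler taking (context, queryset)
    handler = manual_cleanup.get(model)
    if handler is not None:
        # handlers delete the associated files from storage before the rows
        return handler(context, query)
    # models without files in storage can be bulk-deleted directly
    return (query._raw_delete(query.db), 0)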

0 commit comments
