Commit 1d61cc3

support cleaning BA files which are stored in a different bucket
1 parent 8ded8b5 commit 1d61cc3

File tree: 3 files changed, +98 -42 lines


services/cleanup/models.py (+69 -34)

@@ -1,5 +1,6 @@
 import dataclasses
 import itertools
+from collections import defaultdict
 from collections.abc import Callable
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
@@ -9,8 +10,10 @@
 from shared.bundle_analysis import StoragePaths
 from shared.django_apps.compare.models import CommitComparison
 from shared.django_apps.core.models import Commit, Pull
+from shared.django_apps.profiling.models import ProfilingUpload
 from shared.django_apps.reports.models import CommitReport, ReportDetails
 from shared.django_apps.reports.models import ReportSession as Upload
+from shared.django_apps.staticanalysis.models import StaticAnalysisSingleFileSnapshot
 
 from services.archive import ArchiveService, MinioEndpoints
 from services.cleanup.utils import CleanupContext, CleanupResult
@@ -19,15 +22,17 @@
 DELETE_FILES_BATCHSIZE = 50
 
 
-def cleanup_files_batched(context: CleanupContext, paths: list[str]) -> int:
+def cleanup_files_batched(
+    context: CleanupContext, buckets_paths: dict[str, list[str]]
+) -> int:
     cleaned_files = 0
 
     # TODO: maybe reuse the executor across calls?
     with ThreadPoolExecutor() as e:
-        for batched_paths in itertools.batched(paths, DELETE_FILES_BATCHSIZE):
-            e.submit(context.storage.delete_files, context.bucket, list(batched_paths))
-
-            cleaned_files += len(batched_paths)
+        for bucket, paths in buckets_paths.items():
+            for batched_paths in itertools.batched(paths, DELETE_FILES_BATCHSIZE):
+                e.submit(context.storage.delete_files, bucket, list(batched_paths))
+            cleaned_files += len(paths)
 
     return cleaned_files
 
@@ -54,7 +59,9 @@ def cleanup_with_storage_field(
         if len(storage_paths) == 0:
             break
 
-        cleaned_files += cleanup_files_batched(context, storage_paths)
+        cleaned_files += cleanup_files_batched(
+            context, {context.default_bucket: storage_paths}
+        )
         cleaned_models += query.filter(
             id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
         )._raw_delete(query.db)
@@ -98,7 +105,7 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> CleanupRes
         if len(reports) == 0:
             break
 
-        storage_paths: list[str] = []
+        buckets_paths: dict[str, list[str]] = defaultdict(list)
         for (
             report_type,
             report_code,
@@ -118,36 +125,63 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> CleanupRes
             # depending on the `report_type`, we have:
             # - a `chunks` file for coverage
             # - a `bundle_report.sqlite` for BA
-            match report_type:
-                case "bundle_analysis":
-                    path = StoragePaths.bundle_report.path(
-                        repo_key=repo_hash, report_key=external_id
-                    )
-                    # TODO: bundle analysis lives in a different bucket I believe?
-                    storage_paths.append(path)
-                case "test_results":
-                    # TODO:
-                    pass
-                case _:  # coverage
-                    chunks_file_name = (
-                        report_code if report_code is not None else "chunks"
-                    )
-                    path = MinioEndpoints.chunks.get_path(
-                        version="v4",
-                        repo_hash=repo_hash,
-                        commitid=commit_sha,
-                        chunks_file_name=chunks_file_name,
-                    )
-                    storage_paths.append(path)
-
-        cleaned_files += cleanup_files_batched(context, storage_paths)
+            if report_type == "bundle_analysis":
+                path = StoragePaths.bundle_report.path(
+                    repo_key=repo_hash, report_key=external_id
+                )
+                buckets_paths[context.bundleanalysis_bucket].append(path)
+            elif report_type == "test_results":
+                # TA has cached rollups, but those are based on `Branch`
+                pass
+            else:
+                chunks_file_name = report_code if report_code is not None else "chunks"
+                path = MinioEndpoints.chunks.get_path(
+                    version="v4",
+                    repo_hash=repo_hash,
+                    commitid=commit_sha,
+                    chunks_file_name=chunks_file_name,
+                )
+                buckets_paths[context.default_bucket].append(path)
+
+        cleaned_files += cleanup_files_batched(context, buckets_paths)
         cleaned_models += query.filter(
             id__in=query.order_by("id")[:MANUAL_QUERY_CHUNKSIZE]
         )._raw_delete(query.db)
 
     return CleanupResult(cleaned_models, cleaned_files)
 
 
+def cleanup_upload(context: CleanupContext, query: QuerySet) -> CleanupResult:
+    cleaned_files = 0
+
+    # delete `None` `storage_path`s right away
+    cleaned_models = query.filter(storage_path__isnull=True)._raw_delete(query.db)
+
+    # delete all those files from storage, using chunks based on the `id` column
+    storage_query = query.filter(storage_path__isnull=False).order_by("id")
+
+    while True:
+        uploads = storage_query.values_list("report__report_type", "storage_path")[
+            :MANUAL_QUERY_CHUNKSIZE
+        ]
+        if len(uploads) == 0:
+            break
+
+        buckets_paths: dict[str, list[str]] = defaultdict(list)
+        for report_type, storage_path in uploads:
+            if report_type == "bundle_analysis":
+                buckets_paths[context.bundleanalysis_bucket].append(storage_path)
+            else:
+                buckets_paths[context.default_bucket].append(storage_path)
+
+        cleaned_files += cleanup_files_batched(context, buckets_paths)
+        cleaned_models += query.filter(
+            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
+        )._raw_delete(query.db)
+
+    return CleanupResult(cleaned_models, cleaned_files)
+
+
 # All the models that need custom python code for deletions so a bulk `DELETE` query does not work.
 MANUAL_CLEANUP: dict[
     type[Model], Callable[[CleanupContext, QuerySet], CleanupResult]
@@ -156,9 +190,10 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> CleanupRes
     Pull: partial(cleanup_archivefield, "flare"),
     ReportDetails: partial(cleanup_archivefield, "files_array"),
     CommitReport: cleanup_commitreport,
-    Upload: partial(cleanup_with_storage_field, "storage_path"),
+    Upload: cleanup_upload,
     CommitComparison: partial(cleanup_with_storage_field, "report_storage_path"),
-    # TODO: figure out any other models which have files in storage that are not `ArchiveField`
-    # TODO: TA is also storing files in GCS
-    # TODO: profiling, label analysis and static analysis also needs porting to django
+    ProfilingUpload: partial(cleanup_with_storage_field, "raw_upload_location"),
+    StaticAnalysisSingleFileSnapshot: partial(
+        cleanup_with_storage_field, "content_location"
+    ),
 }
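
With this change, `cleanup_files_batched` takes a mapping from bucket name to paths instead of a flat list, so one call can fan deletions out to both the default archive bucket and the bundle-analysis bucket. A minimal sketch of the new call shape, assuming a configured environment where `CleanupContext()` can be constructed; the two path strings are made-up placeholders, not real storage layouts:

from collections import defaultdict

from services.cleanup.models import cleanup_files_batched
from services.cleanup.utils import CleanupContext

context = CleanupContext()

# Group paths per bucket, as `cleanup_commitreport` and `cleanup_upload` now do.
buckets_paths: dict[str, list[str]] = defaultdict(list)
buckets_paths[context.default_bucket].append("some/coverage/chunks")  # placeholder path
buckets_paths[context.bundleanalysis_bucket].append("some/bundle_report.sqlite")  # placeholder path

# Deletes are submitted per bucket in DELETE_FILES_BATCHSIZE chunks on a thread pool;
# the return value counts every path handed off for deletion.
cleaned = cleanup_files_batched(context, buckets_paths)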

services/cleanup/utils.py (+8 -2)

@@ -8,11 +8,17 @@
 
 class CleanupContext:
     storage: StorageService
-    bucket: str
+    default_bucket: str
+    bundleanalysis_bucket: str
 
     def __init__(self):
         self.storage = shared.storage.get_appropriate_storage_service()
-        self.bucket = get_config("services", "minio", "bucket", default="archive")
+        self.default_bucket = get_config(
+            "services", "minio", "bucket", default="archive"
+        )
+        self.bundleanalysis_bucket = get_config(
+            "bundle_analysis", "bucket_name", default="bundle-analysis"
+        )
 
 
 @dataclasses.dataclass
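
The context now resolves two buckets: the existing default from `services.minio.bucket` and the bundle-analysis bucket from `bundle_analysis.bucket_name`, falling back to "archive" and "bundle-analysis" respectively. A rough sketch of that fallback behaviour using a stand-in `get_config` (the real helper comes from the shared config module and may differ):

# Stand-in for the shared `get_config` helper: walk nested keys, return the default on any miss.
def get_config(*path: str, default=None, config: dict | None = None):
    node = config or {}
    for key in path:
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

# With no overrides configured, the defaults from the diff apply:
default_bucket = get_config("services", "minio", "bucket", default="archive")
bundleanalysis_bucket = get_config("bundle_analysis", "bucket_name", default="bundle-analysis")
assert (default_bucket, bundleanalysis_bucket) == ("archive", "bundle-analysis")

# An install overriding the BA bucket name would be picked up the same way:
custom = {"bundle_analysis": {"bucket_name": "my-ba-bucket"}}
assert get_config("bundle_analysis", "bucket_name", default="bundle-analysis", config=custom) == "my-ba-bucket"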

tasks/tests/unit/test_flush_repo.py (+21 -6)

@@ -1,4 +1,5 @@
 import pytest
+from shared.bundle_analysis import StoragePaths
 from shared.django_apps.compare.models import CommitComparison, FlagComparison
 from shared.django_apps.compare.tests.factories import (
     CommitComparisonFactory,
@@ -86,7 +87,6 @@ def test_flush_repo_few_of_each_only_db_objects(mock_storage):
 @pytest.mark.django_db
 def test_flush_repo_little_bit_of_everything(mocker, mock_storage):
     repo = RepositoryFactory()
-    mocker.patch("services.cleanup.utils.StorageService")
     archive_service = ArchiveService(repo)
 
     for i in range(8):
@@ -96,8 +96,20 @@ def test_flush_repo_little_bit_of_everything(mocker, mock_storage):
         commit_report = CommitReportFactory(commit=commit)
         upload = UploadFactory(report=commit_report, storage_path=f"upload{i}")
 
-        archive_service.write_chunks(commit.commitid, f"chunksdata{i}")
-        archive_service.write_file(upload.storage_path, f"uploaddata{i}")
+        archive_service.write_chunks(commit.commitid, f"chunks_data{i}")
+        archive_service.write_file(upload.storage_path, f"upload_data{i}")
+
+        ba_report = CommitReportFactory(commit=commit, report_type="bundle_analysis")
+        ba_upload = UploadFactory(report=ba_report, storage_path=f"ba_upload{i}")
+        ba_report_path = StoragePaths.bundle_report.path(
+            repo_key=archive_service.storage_hash, report_key=ba_report.external_id
+        )
+        archive_service.storage.write_file(
+            "bundle-analysis", ba_report_path, f"ba_report_data{i}"
+        )
+        archive_service.storage.write_file(
+            "bundle-analysis", ba_upload.storage_path, f"ba_upload_data{i}"
+        )
 
     for i in range(17):
         PullFactory(repository=repo, pullid=i + 100)
@@ -106,20 +118,23 @@ def test_flush_repo_little_bit_of_everything(mocker, mock_storage):
         BranchFactory(repository=repo)
 
     archive = mock_storage.storage["archive"]
+    ba_archive = mock_storage.storage["bundle-analysis"]
     assert len(archive) == 16
+    assert len(ba_archive) == 16
 
     task = FlushRepoTask()
     res = task.run_impl({}, repoid=repo.repoid)
 
     assert res == CleanupSummary(
-        CleanupResult(24 + 8 + 8 + 18 + 1 + 8, 16),
+        CleanupResult(24 + 8 + 16 + 18 + 1 + 16, 16 + 16),
         {
             Branch: CleanupResult(24),
             Commit: CleanupResult(8),
-            CommitReport: CleanupResult(8, 8),
+            CommitReport: CleanupResult(16, 16),
             Pull: CleanupResult(18),
             Repository: CleanupResult(1),
-            Upload: CleanupResult(8, 8),
+            Upload: CleanupResult(16, 16),
         },
     )
     assert len(archive) == 0
+    assert len(ba_archive) == 0
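
The doubled numbers in the expected `CleanupSummary` follow from the extra bundle-analysis objects: each of the 8 loop iterations now creates a coverage report/upload pair plus a BA report/upload pair, and writes two files into each bucket. A quick tally under those assumptions:

ITERATIONS = 8

# Database rows: one coverage and one bundle-analysis CommitReport/Upload per iteration.
commit_reports = ITERATIONS * 2   # 16 rows, the CommitReport count in the summary
uploads = ITERATIONS * 2          # 16 rows, the Upload count in the summary

# Files: chunks + upload data in "archive", bundle report + upload data in "bundle-analysis".
archive_files = ITERATIONS * 2    # 16, matching `assert len(archive) == 16`
ba_files = ITERATIONS * 2         # 16, matching `assert len(ba_archive) == 16`

assert archive_files + ba_files == 16 + 16  # total cleaned files in the summary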
