Implement storage- and relationship-aware cleanup #973

Merged · 22 commits · Jan 22, 2025

Commits
bf0841b
Implement storage- and relationship-aware cleanup
Swatinem Dec 18, 2024
3b4cbaa
start working on batched/chunked deletes
Swatinem Jan 7, 2025
e43b561
use `_raw_delete`
Swatinem Jan 7, 2025
7b5b295
add cleanup of `CompareCommit` and BA `CommitReport`s
Swatinem Jan 9, 2025
e6226f7
rename to upload
Swatinem Jan 9, 2025
6031478
start rewriting FlushRepo in terms of cleanup
Swatinem Jan 14, 2025
9635aba
finish up migrating `FlushRepo`
Swatinem Jan 14, 2025
98edb6c
Reimplement `DeleteOwner` in terms of `cleanup`
Swatinem Jan 14, 2025
14cc6da
test a snapshot of the delete queries
Swatinem Jan 15, 2025
d83bbf6
rewrite relation builder to generate complete queries, and prefer sho…
Swatinem Jan 15, 2025
152e704
Move repository to shadow Owner at the start of cleanup task
Swatinem Jan 16, 2025
0d70211
also dump owner deletion queries, ignore certain relations
Swatinem Jan 16, 2025
2eb6a1d
add undocumented relations
Swatinem Jan 16, 2025
8ded8b5
Fix various typos and assumptions
Swatinem Jan 17, 2025
1d61cc3
support cleaning BA files which are stored in a different bucket
Swatinem Jan 17, 2025
90bf1e2
add more undocumented relations
Swatinem Jan 20, 2025
a726394
remove unused `delete_repo_files`
Swatinem Jan 21, 2025
15ef7d2
delete files one-by-one, and reuse the threadpool
Swatinem Jan 21, 2025
6693116
use new file cleanup within `FlareCleanup`
Swatinem Jan 21, 2025
5b6206c
fix `FlareCleanup` tests along with removing more unused `Archive/Sto…
Swatinem Jan 21, 2025
15aea8b
make sure to pick the correct connection for raw deletes
Swatinem Jan 22, 2025
0a22ed4
only re-parent non-`deleted` repos based on `service_id` in `SyncRepos`
Swatinem Jan 22, 2025
14 changes: 9 additions & 5 deletions conftest.py
@@ -278,13 +278,17 @@ def mock_storage(mocker):

@pytest.fixture
def mock_archive_storage(mocker):
m = mocker.patch("shared.api_archive.archive.StorageService")
use_archive = mocker.patch(
"shared.django_apps.core.models.should_write_data_to_storage_config_check"
mocker.patch(
"shared.django_apps.core.models.should_write_data_to_storage_config_check",
return_value=True,
)
use_archive.return_value = True
storage_server = MemoryStorageService({})
m.return_value = storage_server
mocker.patch(
"shared.api_archive.archive.StorageService", return_value=storage_server
)
mocker.patch(
"shared.storage.get_appropriate_storage_service", return_value=storage_server
)
return storage_server


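A hedged sketch of what the reworked fixture enables: both patched entry points (`shared.api_archive.archive.StorageService` and `shared.storage.get_appropriate_storage_service`) now resolve to the same `MemoryStorageService`, so a test can assert against a single in-memory backend. The test name, bucket, and exact read/write signatures below are assumptions, not part of this PR:

```python
def test_storage_entry_points_share_one_backend(mock_archive_storage):
    # write through the shared in-memory backend returned by the fixture
    mock_archive_storage.write_file("archive", "v4/raw/example.txt", b"hello")

    # any code resolving a storage service via either patched entry point
    # reads back the very same data
    assert mock_archive_storage.read_file("archive", "v4/raw/example.txt") == b"hello"
```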
1 change: 1 addition & 0 deletions requirements.in
@@ -52,6 +52,7 @@ sentry-sdk>=2.13.0
sentry-sdk[celery]
SQLAlchemy
SQLAlchemy-Utils
sqlparse
statsd
stripe>=11.4.1
time-machine
4 changes: 3 additions & 1 deletion requirements.txt
@@ -400,7 +400,9 @@ sqlalchemy-utils==0.36.8
# -r requirements.in
# pytest-sqlalchemy
sqlparse==0.5.0
# via django
# via
# -r requirements.in
# django
statsd==3.3.0
# via -r requirements.in
stripe==11.4.1
16 changes: 0 additions & 16 deletions services/archive.py
@@ -229,22 +229,6 @@ def delete_file(self, path) -> None:
"""
self.storage.delete_file(self.root, path)

@sentry_sdk.trace()
def delete_files(self, paths: list[str]) -> list[bool]:
"""
Batch-deletes the given list of files.
"""
return self.storage.delete_files(self.root, paths)

def delete_repo_files(self) -> int:
"""
Deletes an entire repository's contents
"""
path = "v4/repos/{}".format(self.storage_hash)
objects = self.storage.list_folder_contents(self.root, path)
results = self.storage.delete_files(self.root, [obj["name"] for obj in objects])
return len(results)

def read_chunks(self, commit_sha, report_code=None) -> str:
"""
Convenience method to read a chunks file from the archive.
Empty file added services/cleanup/__init__.py
Empty file.
56 changes: 56 additions & 0 deletions services/cleanup/cleanup.py
@@ -0,0 +1,56 @@
import logging

from django.db.models.query import QuerySet

from services.cleanup.models import MANUAL_CLEANUP
from services.cleanup.relations import build_relation_graph
from services.cleanup.utils import CleanupResult, CleanupSummary, cleanup_context

log = logging.getLogger(__name__)


def run_cleanup(
query: QuerySet,
) -> CleanupSummary:
"""
Cleans up all the models and storage files reachable from the given `QuerySet`.

This deletes all database models in topological sort order, and also removes
all the files in storage for any of the models in the relationship graph.

Returns the number of models and files being cleaned up in total, and per-Model.
"""
models_to_cleanup = build_relation_graph(query)

summary = {}
cleaned_models = 0
cleaned_files = 0

with cleanup_context() as context:
for model, query in models_to_cleanup:
# This is needed so that the correct connection is chosen for the
# `_raw_delete` queries, as otherwise it might choose a readonly connection.
query._for_write = True

manual_cleanup = MANUAL_CLEANUP.get(model)
if manual_cleanup is not None:
result = manual_cleanup(context, query)
else:
result = CleanupResult(query._raw_delete(query.db))

if result.cleaned_models > 0 or result.cleaned_files > 0:
summary[model] = result

log.info(
f"Finished cleaning up {model.__name__}",
extra={
"cleaned_models": result.cleaned_models,
"cleaned_files": result.cleaned_files,
},
)

cleaned_models += result.cleaned_models
cleaned_files += result.cleaned_files

totals = CleanupResult(cleaned_models, cleaned_files)
return CleanupSummary(totals, summary)
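As a usage illustration, a minimal sketch of driving `run_cleanup` for a single repository. The exact querysets used by the tasks in this PR may differ, and the `totals`/`summary` attribute names on `CleanupSummary` are assumed from the constructor call above:

```python
from shared.django_apps.core.models import Repository

from services.cleanup.cleanup import run_cleanup


def cleanup_repository(repo_id: int) -> None:
    # `run_cleanup` expands this queryset into its full relation graph
    # (commits, reports, uploads, ...) and deletes in topological sort order,
    # removing any referenced storage files along the way.
    summary = run_cleanup(Repository.objects.filter(repoid=repo_id))

    # attribute names on CleanupSummary are assumed; `cleaned_models` and
    # `cleaned_files` match the CleanupResult fields used above
    for model, result in summary.summary.items():
        print(f"{model.__name__}: {result.cleaned_models} rows, {result.cleaned_files} files")
```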
193 changes: 193 additions & 0 deletions services/cleanup/models.py
@@ -0,0 +1,193 @@
import dataclasses
from collections import defaultdict
from collections.abc import Callable
from functools import partial

from django.db.models import Model
from django.db.models.query import QuerySet
from shared.bundle_analysis import StoragePaths
from shared.django_apps.compare.models import CommitComparison
from shared.django_apps.core.models import Commit, Pull
from shared.django_apps.profiling.models import ProfilingUpload
from shared.django_apps.reports.models import CommitReport, ReportDetails
from shared.django_apps.reports.models import ReportSession as Upload
from shared.django_apps.staticanalysis.models import StaticAnalysisSingleFileSnapshot

from services.archive import ArchiveService, MinioEndpoints
from services.cleanup.utils import CleanupContext, CleanupResult

MANUAL_QUERY_CHUNKSIZE = 5_000
DELETE_FILES_BATCHSIZE = 50


def cleanup_files_batched(
context: CleanupContext, buckets_paths: dict[str, list[str]]
) -> int:
def delete_file(bucket_path: tuple[str, str]) -> bool:
return context.storage.delete_file(bucket_path[0], bucket_path[1])

iter = ((bucket, path) for bucket, paths in buckets_paths.items() for path in paths)
results = context.threadpool.map(delete_file, iter)
return sum(results)


def cleanup_with_storage_field(
path_field: str,
context: CleanupContext,
query: QuerySet,
) -> CleanupResult:
cleaned_files = 0

# delete `None` `path_field`s right away
cleaned_models = query.filter(**{f"{path_field}__isnull": True})._raw_delete(
query.db
)

# delete all those files from storage, using chunks based on the `id` column
storage_query = query.filter(**{f"{path_field}__isnull": False}).order_by("id")

while True:
storage_paths = storage_query.values_list(path_field, flat=True)[
:MANUAL_QUERY_CHUNKSIZE
]
if len(storage_paths) == 0:
break

cleaned_files += cleanup_files_batched(
context, {context.default_bucket: storage_paths}
)
cleaned_models += query.filter(
id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
)._raw_delete(query.db)

return CleanupResult(cleaned_models, cleaned_files)


def cleanup_archivefield(
field_name: str, context: CleanupContext, query: QuerySet
) -> CleanupResult:
model_field_name = f"_{field_name}_storage_path"

return cleanup_with_storage_field(model_field_name, context, query)


# This has all the `Repository` fields needed by `get_archive_hash`
@dataclasses.dataclass
class FakeRepository:
repoid: int
service: str
service_id: str


def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> CleanupResult:
coverage_reports = query.values_list(
"report_type",
"code",
"external_id",
"commit__commitid",
"commit__repository__repoid",
"commit__repository__author__service",
"commit__repository__service_id",
).order_by("id")

cleaned_models = 0
cleaned_files = 0
repo_hashes: dict[int, str] = {}

while True:
reports = coverage_reports[:MANUAL_QUERY_CHUNKSIZE]
if len(reports) == 0:
break

buckets_paths: dict[str, list[str]] = defaultdict(list)
for (
Contributor: [very nit] maybe a NamedTuple would make the code more ergonomic here (but less performant for sure)

Contributor Author (Swatinem): I don’t fully understand? Would you prefer not to destructure the tuple into variables right away, but rather access them through a NamedTuple object? I’m not sure that would make things more readable 🤔

report_type,
report_code,
external_id,
commit_sha,
repoid,
repo_service,
repo_service_id,
) in reports:
if repoid not in repo_hashes:
fake_repo = FakeRepository(
repoid=repoid, service=repo_service, service_id=repo_service_id
)
repo_hashes[repoid] = ArchiveService.get_archive_hash(fake_repo)
repo_hash = repo_hashes[repoid]

# depending on the `report_type`, we have:
# - a `chunks` file for coverage
# - a `bundle_report.sqlite` for BA
if report_type == "bundle_analysis":
path = StoragePaths.bundle_report.path(
repo_key=repo_hash, report_key=external_id
)
buckets_paths[context.bundleanalysis_bucket].append(path)
elif report_type == "test_results":
# TA has cached rollups, but those are based on `Branch`
pass
else:
chunks_file_name = report_code if report_code is not None else "chunks"
path = MinioEndpoints.chunks.get_path(
version="v4",
repo_hash=repo_hash,
commitid=commit_sha,
chunks_file_name=chunks_file_name,
)
buckets_paths[context.default_bucket].append(path)

cleaned_files += cleanup_files_batched(context, buckets_paths)
cleaned_models += query.filter(
id__in=query.order_by("id")[:MANUAL_QUERY_CHUNKSIZE]
)._raw_delete(query.db)

return CleanupResult(cleaned_models, cleaned_files)


def cleanup_upload(context: CleanupContext, query: QuerySet) -> CleanupResult:
cleaned_files = 0

# delete `None` `storage_path`s right away
cleaned_models = query.filter(storage_path__isnull=True)._raw_delete(query.db)

# delete all those files from storage, using chunks based on the `id` column
storage_query = query.filter(storage_path__isnull=False).order_by("id")

while True:
uploads = storage_query.values_list("report__report_type", "storage_path")[
:MANUAL_QUERY_CHUNKSIZE
]
if len(uploads) == 0:
break

buckets_paths: dict[str, list[str]] = defaultdict(list)
for report_type, storage_path in uploads:
if report_type == "bundle_analysis":
buckets_paths[context.bundleanalysis_bucket].append(storage_path)
else:
buckets_paths[context.default_bucket].append(storage_path)

cleaned_files += cleanup_files_batched(context, buckets_paths)
cleaned_models += query.filter(
id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
)._raw_delete(query.db)

return CleanupResult(cleaned_models, cleaned_files)


# All the models that need custom Python code for deletion, because a plain bulk `DELETE` query is not enough for them.
MANUAL_CLEANUP: dict[
type[Model], Callable[[CleanupContext, QuerySet], CleanupResult]
] = {
Commit: partial(cleanup_archivefield, "report"),
Pull: partial(cleanup_archivefield, "flare"),
ReportDetails: partial(cleanup_archivefield, "files_array"),
CommitReport: cleanup_commitreport,
Upload: cleanup_upload,
CommitComparison: partial(cleanup_with_storage_field, "report_storage_path"),
ProfilingUpload: partial(cleanup_with_storage_field, "raw_upload_location"),
StaticAnalysisSingleFileSnapshot: partial(
cleanup_with_storage_field, "content_location"
),
}
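To make the chunked-deletion pattern shared by these handlers easier to follow, here is a standalone, Django-free sketch: rows without a storage path are dropped outright, the rest are processed in fixed-size chunks where the files are deleted first and the rows "raw-deleted" afterwards, and a registry of handlers is pre-bound to each model's path field via `partial`, mirroring `MANUAL_CLEANUP`. All names and sizes are illustrative; the real handlers operate on QuerySets and the shared storage client:

```python
from functools import partial

CHUNKSIZE = 2  # stands in for MANUAL_QUERY_CHUNKSIZE


def chunked_cleanup(path_field: str, storage: set[str], rows: list[dict]) -> tuple[int, int]:
    """Delete `rows` in chunks, removing the file named by `path_field` first."""
    cleaned_models = cleaned_files = 0

    # rows with no storage path are dropped outright (the `__isnull=True` fast path)
    with_paths = []
    for row in rows:
        if row[path_field] is None:
            cleaned_models += 1
        else:
            with_paths.append(row)

    while with_paths:
        chunk, with_paths = with_paths[:CHUNKSIZE], with_paths[CHUNKSIZE:]
        for row in chunk:
            storage.discard(row[path_field])  # "delete" the file
            cleaned_files += 1
        cleaned_models += len(chunk)  # "raw-delete" the rows

    return cleaned_models, cleaned_files


# the registry mirrors MANUAL_CLEANUP: a handler pre-bound to the model's path field
HANDLERS = {"profiling_upload": partial(chunked_cleanup, "raw_upload_location")}

if __name__ == "__main__":
    storage = {"v4/raw/a", "v4/raw/b"}
    rows = [
        {"raw_upload_location": "v4/raw/a"},
        {"raw_upload_location": None},
        {"raw_upload_location": "v4/raw/b"},
    ]
    print(HANDLERS["profiling_upload"](storage, rows))  # -> (3, 2)
```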
54 changes: 54 additions & 0 deletions services/cleanup/owner.py
@@ -0,0 +1,54 @@
import logging

from django.db import transaction
from django.db.models import Q
from shared.django_apps.codecov_auth.models import Owner, OwnerProfile
from shared.django_apps.core.models import Commit, Pull, Repository

from services.cleanup.cleanup import run_cleanup
from services.cleanup.utils import CleanupSummary

log = logging.getLogger(__name__)

CLEAR_ARRAY_FIELDS = ["plan_activated_users", "organizations", "admins"]


def cleanup_owner(owner_id: int) -> CleanupSummary:
log.info("Started/Continuing Owner cleanup", extra={"owner_id": owner_id})

clear_owner_references(owner_id)
owner_query = Owner.objects.filter(ownerid=owner_id)
summary = run_cleanup(owner_query)

log.info("Owner cleanup finished", extra={"owner_id": owner_id, "summary": summary})
return summary


# TODO: maybe turn this into a `MANUAL_CLEANUP`?
def clear_owner_references(owner_id: int):
"""
This clears the `ownerid` from various DB arrays where it is being referenced.
"""

OwnerProfile.objects.filter(default_org=owner_id).update(default_org=None)
Owner.objects.filter(bot=owner_id).update(bot=None)
Repository.objects.filter(bot=owner_id).update(bot=None)
Commit.objects.filter(author=owner_id).update(author=None)
Pull.objects.filter(author=owner_id).update(author=None)

# This uses a transaction / `select_for_update` to ensure consistency when
# modifying these `ArrayField`s in Python.
# I don’t think we have such consistency anywhere else in the codebase, so
# if this is causing lock contention issues, it’s also fair to avoid this.
with transaction.atomic():
filter = Q()
for field in CLEAR_ARRAY_FIELDS:
filter = filter | Q(**{f"{field}__contains": [owner_id]})

owners_with_reference = Owner.objects.select_for_update().filter(filter)
for owner in owners_with_reference:
for field in CLEAR_ARRAY_FIELDS:
array = getattr(owner, field)
setattr(owner, field, [x for x in array if x != owner_id])

owner.save(update_fields=CLEAR_ARRAY_FIELDS)
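For readers less familiar with Django ArrayField lookups, a short sketch of the filter the atomic block above builds: `__contains` on an ArrayField maps to Postgres array containment (`@>`), so the combined `Q` selects every owner that references `owner_id` in one of the three arrays. The standalone helper is illustrative, not part of the PR:

```python
from django.db.models import Q

CLEAR_ARRAY_FIELDS = ["plan_activated_users", "organizations", "admins"]


def owner_reference_filter(owner_id: int) -> Q:
    """Build the OR-of-containment filter used by `clear_owner_references`.

    Roughly equivalent to:
        WHERE plan_activated_users @> ARRAY[owner_id]
           OR organizations        @> ARRAY[owner_id]
           OR admins               @> ARRAY[owner_id]
    """
    filter = Q()
    for field in CLEAR_ARRAY_FIELDS:
        filter |= Q(**{f"{field}__contains": [owner_id]})
    return filter
```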