Commit

add cleanup of CompareCommit and BA CommitReports
Swatinem committed Jan 9, 2025
1 parent 389d49e commit 926916d
Showing 3 changed files with 89 additions and 75 deletions.
requirements.in: 2 changes (1 addition, 1 deletion)
@@ -1,5 +1,5 @@
https://github.com/codecov/test-results-parser/archive/996ecb2aaf7767bf4c2944c75835c1ee1eb2b566.tar.gz#egg=test-results-parser
https://github.com/codecov/shared/archive/67879c17712e42067f67d3b60dd0e2c1a5db5170.tar.gz#egg=shared
https://github.com/codecov/shared/archive/eb0a3afc419c1e3261eac2e29e25650b58bda5eb.tar.gz#egg=shared
https://github.com/codecov/timestring/archive/d37ceacc5954dff3b5bd2f887936a98a668dda42.tar.gz#egg=timestring
asgiref>=3.7.2
analytics-python==1.3.0b1
requirements.txt: 6 changes (3 additions, 3 deletions)
@@ -248,11 +248,11 @@ pyasn1-modules==0.2.8
# via google-auth
pycparser==2.20
# via cffi
pydantic==2.10.3
pydantic==2.10.4
# via
# -r requirements.in
# openai
pydantic-core==2.27.1
pydantic-core==2.27.2
# via pydantic
pyjwt==2.10.0
# via
@@ -336,7 +336,7 @@ sentry-sdk==2.13.0
# shared
setuptools==75.6.0
# via nodeenv
shared @ https://github.com/codecov/shared/archive/67879c17712e42067f67d3b60dd0e2c1a5db5170.tar.gz#egg=shared
shared @ https://github.com/codecov/shared/archive/eb0a3afc419c1e3261eac2e29e25650b58bda5eb.tar.gz#egg=shared
# via -r requirements.in
six==1.16.0
# via
services/cleanup/models.py: 156 changes (85 additions, 71 deletions)
@@ -5,9 +5,15 @@
from functools import partial

from django.db.models import Model
from django.db.models.query import Q, QuerySet
from django.db.models.query import QuerySet
from shared.bundle_analysis import StoragePaths
from shared.django_apps.core.models import Commit, Pull
from shared.django_apps.reports.models import CommitReport, ReportDetails, ReportSession
from shared.django_apps.reports.models import (
CommitReport,
CompareCommit,
ReportDetails,
ReportSession,
)

from services.archive import ArchiveService, MinioEndpoints
from services.cleanup.utils import CleanupContext
@@ -16,58 +22,75 @@
DELETE_FILES_BATCHSIZE = 50


# This has all the `Repository` fields needed by `get_archive_hash`
@dataclasses.dataclass
class FakeRepository:
repoid: int
service: str
service_id: str


def cleanup_files_batched(context: CleanupContext, paths: list[str]) -> int:
cleaned_files = 0

# TODO: maybe reuse the executor across calls?
with ThreadPoolExecutor() as e:
for batched_paths in itertools.batched(paths, DELETE_FILES_BATCHSIZE):
e.submit(context.storage.delete_files(context.bucket, batched_paths))
e.submit(context.storage.delete_files, context.bucket, batched_paths)

cleaned_files += len(batched_paths)

return cleaned_files


def cleanup_archivefield(
field_name: str, context: CleanupContext, query: QuerySet
def cleanup_with_storage_field(
path_field: str,
context: CleanupContext,
query: QuerySet,
) -> tuple[int, int]:
model_field_name = f"_{field_name}_storage_path"
cleaned_files = 0

# delete `None` `field_name`s right away
cleaned_models = query.filter(**{f"{model_field_name}__isnull": True})._raw_delete(
# delete `None` `path_field`s right away
cleaned_models = query.filter(**{f"{path_field}__isnull": True})._raw_delete(
query.db
)

# and then delete all non-`None` `field_name`s:
storage_query = query.filter(**{f"{model_field_name}__isnull": False})
res = cleanup_with_storage_field(context, model_field_name, storage_query)
# delete all those files from storage, using chunks based on the `id` column
storage_query = query.filter(**{f"{path_field}__isnull": False}).order_by("id")

while True:
storage_paths = storage_query.values_list(path_field, flat=True)[
:MANUAL_QUERY_CHUNKSIZE
]
if len(storage_paths) == 0:
break

cleaned_files += cleanup_files_batched(context, storage_paths)
cleaned_models += query.filter(
id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
)._raw_delete(query.db)

cleaned_models += res[0]
cleaned_files = res[1]
return (cleaned_models, cleaned_files)


def cleanup_archivefield(
field_name: str, context: CleanupContext, query: QuerySet
) -> tuple[int, int]:
model_field_name = f"_{field_name}_storage_path"

return cleanup_with_storage_field(model_field_name, context, query)


# This has all the `Repository` fields needed by `get_archive_hash`
@dataclasses.dataclass
class FakeRepository:
repoid: int
service: str
service_id: str


def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
coverage_reports = (
query.filter(Q(report_type=None) | Q(report_type="coverage"))
.values_list(
"code",
"commit__commitid",
"repository__repoid",
"repository__owner__service",
"repository__service_id",
)
.order_by("id")
)
coverage_reports = query.values_list(
"report_type",
"code",
"external_id",
"commit__commitid",
"repository__repoid",
"repository__owner__service",
"repository__service_id",
).order_by("id")

cleaned_models = 0
cleaned_files = 0
@@ -80,7 +103,9 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,

storage_paths: list[str] = []
for (
report_type,
report_code,
external_id,
commit_sha,
repoid,
repo_service,
@@ -93,14 +118,30 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
repo_hashes[repoid] = ArchiveService.get_archive_hash(fake_repo)
repo_hash = repo_hashes[repoid]

chunks_file_name = report_code if report_code is not None else "chunks"
path = MinioEndpoints.chunks.get_path(
version="v4",
repo_hash=repo_hash,
commitid=commit_sha,
chunks_file_name=chunks_file_name,
)
storage_paths.append(path)
# depending on the `report_type`, we have:
# - a `chunks` file for coverage
# - a `bundle_report.sqlite` for BA
match report_type:
case "bundle_analysis":
path = StoragePaths.bundle_report.path(
repo_key=repo_hash, report_key=external_id
)
# TODO: bundle analysis lives in a different bucket I believe?
storage_paths.append(path)
case "test_results":
# TODO:
pass
case _: # coverage
chunks_file_name = (
report_code if report_code is not None else "chunks"
)
path = MinioEndpoints.chunks.get_path(
version="v4",
repo_hash=repo_hash,
commitid=commit_sha,
chunks_file_name=chunks_file_name,
)
storage_paths.append(path)

cleaned_files += cleanup_files_batched(context, storage_paths)
cleaned_models += query.filter(
@@ -110,34 +151,7 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
return (cleaned_models, cleaned_files)


def cleanup_upload(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
return cleanup_with_storage_field(context, "storage_path", query)


def cleanup_with_storage_field(
context: CleanupContext,
path_field: str,
query: QuerySet,
) -> tuple[int, int]:
cleaned_models = 0
cleaned_files = 0

# delete all those files from storage, using chunks based on the `id` column
storage_query = query.order_by("id")

while True:
storage_paths = storage_query.values_list(path_field, flat=True)[
:MANUAL_QUERY_CHUNKSIZE
]
if len(storage_paths) == 0:
break

cleaned_files += cleanup_files_batched(context, storage_paths)
cleaned_models += query.filter(
id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
)._raw_delete(query.db)

return (cleaned_models, cleaned_files)
# "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"


# All the models that need custom python code for deletions so a bulk `DELETE` query does not work.
@@ -148,9 +162,9 @@ def cleanup_with_storage_field(
Pull: partial(cleanup_archivefield, "flare"),
ReportDetails: partial(cleanup_archivefield, "files_array"),
CommitReport: cleanup_commitreport,
ReportSession: cleanup_upload,
ReportSession: partial(cleanup_with_storage_field, "storage_path"),
CompareCommit: partial(cleanup_with_storage_field, "report_storage_path"),
# TODO: figure out any other models which have files in storage that are not `ArchiveField`
# TODO: TA is also storing files in GCS
# TODO: BA is also storing files in GCS
# TODO: There is also `CompareCommit.report_storage_path`, but that does not seem to be implemented as Django model?
# TODO: profiling, label analysis and static analysis also needs porting to django
}
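
As a reading aid, here is a minimal sketch of how the per-model cleanup callables registered above could be driven. It is not part of this commit: the mapping's opening line is collapsed out of the diff, so the name MANUAL_CLEANUP and the argument-free CleanupContext() constructor below are assumptions; only the (context, query) -> (cleaned rows, cleaned files) call shape and the _raw_delete fallback are taken from the code in this change.

# Hedged sketch, not part of this commit: driving the per-model cleanup entries.
# Assumed names: MANUAL_CLEANUP (the mapping's definition is collapsed in this
# diff) and an argument-free CleanupContext() constructor.
from django.db.models import Model
from django.db.models.query import QuerySet

from services.cleanup.models import MANUAL_CLEANUP  # assumed name
from services.cleanup.utils import CleanupContext


def cleanup_queryset(model: type[Model], query: QuerySet) -> tuple[int, int]:
    """Delete the rows in `query` plus any files they reference in storage.

    Returns (deleted rows, deleted storage files). Models without a custom
    entry fall back to a plain bulk delete that touches no storage.
    """
    context = CleanupContext()
    if custom_cleanup := MANUAL_CLEANUP.get(model):
        # Entries such as CompareCommit and ReportSession delete their storage
        # files in id-ordered chunks before issuing the raw DELETE
        # (see cleanup_with_storage_field above).
        return custom_cleanup(context, query)
    return (query._raw_delete(query.db), 0)

Because the storage-path logic stays behind these per-model callables, any driver of that shape picks up the new CompareCommit and bundle-analysis handling without further changes.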
