Commit 0dd448f

add cleanup of CompareCommit and BA CommitReports
1 parent f0e2bce commit 0dd448f

File tree

1 file changed: +85 -71 lines changed


services/cleanup/models.py

+85-71
@@ -5,9 +5,15 @@
 from functools import partial
 
 from django.db.models import Model
-from django.db.models.query import Q, QuerySet
+from django.db.models.query import QuerySet
+from shared.bundle_analysis import StoragePaths
 from shared.django_apps.core.models import Commit, Pull
-from shared.django_apps.reports.models import CommitReport, ReportDetails, ReportSession
+from shared.django_apps.reports.models import (
+    CommitReport,
+    CompareCommit,
+    ReportDetails,
+    ReportSession,
+)
 
 from services.archive import ArchiveService, MinioEndpoints
 from services.cleanup.utils import CleanupContext
@@ -16,58 +22,75 @@
 DELETE_FILES_BATCHSIZE = 50
 
 
-# This has all the `Repository` fields needed by `get_archive_hash`
-@dataclasses.dataclass
-class FakeRepository:
-    repoid: int
-    service: str
-    service_id: str
-
-
 def cleanup_files_batched(context: CleanupContext, paths: list[str]) -> int:
     cleaned_files = 0
 
     # TODO: maybe reuse the executor across calls?
     with ThreadPoolExecutor() as e:
         for batched_paths in itertools.batched(paths, DELETE_FILES_BATCHSIZE):
-            e.submit(context.storage.delete_files(context.bucket, batched_paths))
+            e.submit(context.storage.delete_files, context.bucket, batched_paths)
 
             cleaned_files += len(batched_paths)
 
     return cleaned_files
 
 
-def cleanup_archivefield(
-    field_name: str, context: CleanupContext, query: QuerySet
+def cleanup_with_storage_field(
+    path_field: str,
+    context: CleanupContext,
+    query: QuerySet,
 ) -> tuple[int, int]:
-    model_field_name = f"_{field_name}_storage_path"
+    cleaned_files = 0
 
-    # delete `None` `field_name`s right away
-    cleaned_models = query.filter(**{f"{model_field_name}__isnull": True})._raw_delete(
+    # delete `None` `path_field`s right away
+    cleaned_models = query.filter(**{f"{path_field}__isnull": True})._raw_delete(
         query.db
     )
 
-    # and then delete all non-`None` `field_name`s:
-    storage_query = query.filter(**{f"{model_field_name}__isnull": False})
-    res = cleanup_with_storage_field(context, model_field_name, storage_query)
+    # delete all those files from storage, using chunks based on the `id` column
+    storage_query = query.filter(**{f"{path_field}__isnull": False}).order_by("id")
+
+    while True:
+        storage_paths = storage_query.values_list(path_field, flat=True)[
+            :MANUAL_QUERY_CHUNKSIZE
+        ]
+        if len(storage_paths) == 0:
+            break
+
+        cleaned_files += cleanup_files_batched(context, storage_paths)
+        cleaned_models += query.filter(
+            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
+        )._raw_delete(query.db)
 
-    cleaned_models += res[0]
-    cleaned_files = res[1]
     return (cleaned_models, cleaned_files)
 
 
+def cleanup_archivefield(
+    field_name: str, context: CleanupContext, query: QuerySet
+) -> tuple[int, int]:
+    model_field_name = f"_{field_name}_storage_path"
+
+    return cleanup_with_storage_field(model_field_name, context, query)
+
+
+# This has all the `Repository` fields needed by `get_archive_hash`
+@dataclasses.dataclass
+class FakeRepository:
+    repoid: int
+    service: str
+    service_id: str
+
+
 def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
-    coverage_reports = (
-        query.filter(Q(report_type=None) | Q(report_type="coverage"))
-        .values_list(
-            "code",
-            "commit__commitid",
-            "repository__repoid",
-            "repository__owner__service",
-            "repository__service_id",
-        )
-        .order_by("id")
-    )
+    coverage_reports = query.values_list(
+        "report_type",
+        "code",
+        "external_id",
+        "commit__commitid",
+        "repository__repoid",
+        "repository__owner__service",
+        "repository__service_id",
+    ).order_by("id")
 
     cleaned_models = 0
     cleaned_files = 0
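
Side note on the `e.submit` fix in `cleanup_files_batched` above: `executor.submit(fn(args))` calls `fn` eagerly on the submitting thread and hands its return value to the executor, while `executor.submit(fn, args)` schedules the call itself on a worker thread. A minimal, standalone sketch (using a stand-in `delete_files`, not the real storage client):

from concurrent.futures import ThreadPoolExecutor


def delete_files(bucket: str, paths: list[str]) -> int:
    # stand-in for `context.storage.delete_files`
    print(f"deleting {len(paths)} files from {bucket}")
    return len(paths)


with ThreadPoolExecutor() as e:
    # fixed form: the call runs on a worker thread, and the Future resolves
    # to the function's return value
    future = e.submit(delete_files, "archive", ["a.txt", "b.txt"])
    assert future.result() == 2

    # buggy form (as before this commit): `delete_files(...)` has already run
    # here on the calling thread, and `submit` would then be handed the
    # integer 2, which is not callable, so the resulting Future fails
    # e.submit(delete_files("archive", ["a.txt", "b.txt"]))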
@@ -80,7 +103,9 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
 
         storage_paths: list[str] = []
         for (
+            report_type,
             report_code,
+            external_id,
             commit_sha,
             repoid,
             repo_service,
@@ -93,14 +118,30 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
                 repo_hashes[repoid] = ArchiveService.get_archive_hash(fake_repo)
             repo_hash = repo_hashes[repoid]
 
-            chunks_file_name = report_code if report_code is not None else "chunks"
-            path = MinioEndpoints.chunks.get_path(
-                version="v4",
-                repo_hash=repo_hash,
-                commitid=commit_sha,
-                chunks_file_name=chunks_file_name,
-            )
-            storage_paths.append(path)
+            # depending on the `report_type`, we have:
+            # - a `chunks` file for coverage
+            # - a `bundle_report.sqlite` for BA
+            match report_type:
+                case "bundle_analysis":
+                    path = StoragePaths.bundle_report.path(
+                        repo_key=repo_hash, report_key=external_id
+                    )
+                    # TODO: bundle analysis lives in a different bucket I believe?
+                    storage_paths.append(path)
+                case "test_results":
+                    # TODO:
+                    pass
+                case _:  # coverage
+                    chunks_file_name = (
+                        report_code if report_code is not None else "chunks"
+                    )
+                    path = MinioEndpoints.chunks.get_path(
+                        version="v4",
+                        repo_hash=repo_hash,
+                        commitid=commit_sha,
+                        chunks_file_name=chunks_file_name,
+                    )
+                    storage_paths.append(path)
 
         cleaned_files += cleanup_files_batched(context, storage_paths)
         cleaned_models += query.filter(
@@ -110,34 +151,7 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
     return (cleaned_models, cleaned_files)
 
 
-def cleanup_upload(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
-    return cleanup_with_storage_field(context, "storage_path", query)
-
-
-def cleanup_with_storage_field(
-    context: CleanupContext,
-    path_field: str,
-    query: QuerySet,
-) -> tuple[int, int]:
-    cleaned_models = 0
-    cleaned_files = 0
-
-    # delete all those files from storage, using chunks based on the `id` column
-    storage_query = query.order_by("id")
-
-    while True:
-        storage_paths = storage_query.values_list(path_field, flat=True)[
-            :MANUAL_QUERY_CHUNKSIZE
-        ]
-        if len(storage_paths) == 0:
-            break
-
-        cleaned_files += cleanup_files_batched(context, storage_paths)
-        cleaned_models += query.filter(
-            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
-        )._raw_delete(query.db)
-
-    return (cleaned_models, cleaned_files)
+# "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"
 
 
 # All the models that need custom python code for deletions so a bulk `DELETE` query does not work.
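
The `# "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"` comment added in this hunk spells out the bundle-analysis storage layout that `StoragePaths.bundle_report` resolves to in `cleanup_commitreport` above. A hand-rolled, illustrative sketch of that path construction (not how shared's `StoragePaths` is actually implemented):

# Illustrative only: formats the template quoted in the comment above.
BUNDLE_REPORT_TEMPLATE = "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"


def bundle_report_path(repo_key: str, report_key: str) -> str:
    # `repo_key` is the repository's archive hash, `report_key` the
    # CommitReport's `external_id` (as passed in `cleanup_commitreport`)
    return BUNDLE_REPORT_TEMPLATE.format(repo_key=repo_key, report_key=report_key)


assert (
    bundle_report_path("abc123", "d3adb33f")
    == "v1/repos/abc123/d3adb33f/bundle_report.sqlite"
)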
@@ -148,9 +162,9 @@ def cleanup_with_storage_field(
     Pull: partial(cleanup_archivefield, "flare"),
     ReportDetails: partial(cleanup_archivefield, "files_array"),
     CommitReport: cleanup_commitreport,
-    ReportSession: cleanup_upload,
+    ReportSession: partial(cleanup_with_storage_field, "storage_path"),
+    CompareCommit: partial(cleanup_with_storage_field, "report_storage_path"),
     # TODO: figure out any other models which have files in storage that are not `ArchiveField`
     # TODO: TA is also storing files in GCS
-    # TODO: BA is also storing files in GCS
-    # TODO: There is also `CompareCommit.report_storage_path`, but that does not seem to be implemented as Django model?
+    # TODO: profiling, label analysis and static analysis also needs porting to django
 }
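
For context, a hypothetical sketch of how a caller might dispatch on this mapping (the dict's actual name and the calling code live outside this diff): models listed here get their custom handler, everything else can fall back to a plain bulk delete.

def cleanup_queryset(manual_cleanup: dict, context, model, query) -> tuple[int, int]:
    # `manual_cleanup` is assumed to be the mapping defined above:
    # model class -> handler taking (context, queryset)
    handler = manual_cleanup.get(model)
    if handler is not None:
        # handlers delete the associated files from storage before the rows
        return handler(context, query)
    # models without files in storage can be bulk-deleted directly
    return (query._raw_delete(query.db), 0)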

0 commit comments
