@@ -5,9 +5,15 @@
 from functools import partial

 from django.db.models import Model
-from django.db.models.query import Q, QuerySet
+from django.db.models.query import QuerySet
+from shared.bundle_analysis import StoragePaths
 from shared.django_apps.core.models import Commit, Pull
-from shared.django_apps.reports.models import CommitReport, ReportDetails, ReportSession
+from shared.django_apps.reports.models import (
+    CommitReport,
+    CompareCommit,
+    ReportDetails,
+    ReportSession,
+)

 from services.archive import ArchiveService, MinioEndpoints
 from services.cleanup.utils import CleanupContext
@@ -16,58 +22,75 @@
 DELETE_FILES_BATCHSIZE = 50


-# This has all the `Repository` fields needed by `get_archive_hash`
-@dataclasses.dataclass
-class FakeRepository:
-    repoid: int
-    service: str
-    service_id: str
-
-
 def cleanup_files_batched(context: CleanupContext, paths: list[str]) -> int:
     cleaned_files = 0

     # TODO: maybe reuse the executor across calls?
     with ThreadPoolExecutor() as e:
         for batched_paths in itertools.batched(paths, DELETE_FILES_BATCHSIZE):
-            e.submit(context.storage.delete_files(context.bucket, batched_paths))
+            e.submit(context.storage.delete_files, context.bucket, batched_paths)

             cleaned_files += len(batched_paths)

     return cleaned_files


-def cleanup_archivefield(
-    field_name: str, context: CleanupContext, query: QuerySet
+def cleanup_with_storage_field(
+    path_field: str,
+    context: CleanupContext,
+    query: QuerySet,
 ) -> tuple[int, int]:
-    model_field_name = f"_{field_name}_storage_path"
+    cleaned_files = 0

-    # delete `None` `field_name`s right away
-    cleaned_models = query.filter(**{f"{model_field_name}__isnull": True})._raw_delete(
+    # delete `None` `path_field`s right away
+    cleaned_models = query.filter(**{f"{path_field}__isnull": True})._raw_delete(
         query.db
     )

-    # and then delete all non-`None` `field_name`s:
-    storage_query = query.filter(**{f"{model_field_name}__isnull": False})
-    res = cleanup_with_storage_field(context, model_field_name, storage_query)
+    # delete all those files from storage, using chunks based on the `id` column
+    storage_query = query.filter(**{f"{path_field}__isnull": False}).order_by("id")
+
+    while True:
+        storage_paths = storage_query.values_list(path_field, flat=True)[
+            :MANUAL_QUERY_CHUNKSIZE
+        ]
+        if len(storage_paths) == 0:
+            break
+
+        cleaned_files += cleanup_files_batched(context, storage_paths)
+        cleaned_models += query.filter(
+            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
+        )._raw_delete(query.db)

-    cleaned_models += res[0]
-    cleaned_files = res[1]
     return (cleaned_models, cleaned_files)


+def cleanup_archivefield(
+    field_name: str, context: CleanupContext, query: QuerySet
+) -> tuple[int, int]:
+    model_field_name = f"_{field_name}_storage_path"
+
+    return cleanup_with_storage_field(model_field_name, context, query)
+
+
+# This has all the `Repository` fields needed by `get_archive_hash`
+@dataclasses.dataclass
+class FakeRepository:
+    repoid: int
+    service: str
+    service_id: str
+
+
 def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
-    coverage_reports = (
-        query.filter(Q(report_type=None) | Q(report_type="coverage"))
-        .values_list(
-            "code",
-            "commit__commitid",
-            "repository__repoid",
-            "repository__owner__service",
-            "repository__service_id",
-        )
-        .order_by("id")
-    )
+    coverage_reports = query.values_list(
+        "report_type",
+        "code",
+        "external_id",
+        "commit__commitid",
+        "repository__repoid",
+        "repository__owner__service",
+        "repository__service_id",
+    ).order_by("id")

     cleaned_models = 0
     cleaned_files = 0
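Note on the `e.submit` change in `cleanup_files_batched` above: `ThreadPoolExecutor.submit(fn, *args)` takes the callable and its arguments separately, whereas the old form called `context.storage.delete_files(...)` eagerly on the calling thread and handed its return value to `submit`, so no deletion work was actually offloaded to the pool. A minimal sketch of the difference, using a hypothetical `delete_batch` stand-in rather than the real storage client:

from concurrent.futures import ThreadPoolExecutor

def delete_batch(bucket: str, paths: list[str]) -> int:
    # stand-in for context.storage.delete_files (assumed here, not part of the PR)
    print(f"deleting {len(paths)} objects from {bucket}")
    return len(paths)

with ThreadPoolExecutor() as e:
    # old form: delete_batch runs synchronously on the caller's thread, and its
    # return value (not a callable) is what gets submitted to the pool
    # e.submit(delete_batch("archive", ["a", "b"]))

    # new form: the pool itself invokes delete_batch("archive", ["a", "b"]) on a worker thread
    future = e.submit(delete_batch, "archive", ["a", "b"])
    print(future.result())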
@@ -80,7 +103,9 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,

         storage_paths: list[str] = []
         for (
+            report_type,
             report_code,
+            external_id,
             commit_sha,
             repoid,
             repo_service,
@@ -93,14 +118,30 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
                 repo_hashes[repoid] = ArchiveService.get_archive_hash(fake_repo)
             repo_hash = repo_hashes[repoid]

-            chunks_file_name = report_code if report_code is not None else "chunks"
-            path = MinioEndpoints.chunks.get_path(
-                version="v4",
-                repo_hash=repo_hash,
-                commitid=commit_sha,
-                chunks_file_name=chunks_file_name,
-            )
-            storage_paths.append(path)
+            # depending on the `report_type`, we have:
+            # - a `chunks` file for coverage
+            # - a `bundle_report.sqlite` for BA
+            match report_type:
+                case "bundle_analysis":
+                    path = StoragePaths.bundle_report.path(
+                        repo_key=repo_hash, report_key=external_id
+                    )
+                    # TODO: bundle analysis lives in a different bucket I believe?
+                    storage_paths.append(path)
+                case "test_results":
+                    # TODO:
+                    pass
+                case _:  # coverage
+                    chunks_file_name = (
+                        report_code if report_code is not None else "chunks"
+                    )
+                    path = MinioEndpoints.chunks.get_path(
+                        version="v4",
+                        repo_hash=repo_hash,
+                        commitid=commit_sha,
+                        chunks_file_name=chunks_file_name,
+                    )
+                    storage_paths.append(path)

         cleaned_files += cleanup_files_batched(context, storage_paths)
         cleaned_models += query.filter(
@@ -110,34 +151,7 @@ def cleanup_commitreport(context: CleanupContext, query: QuerySet) -> tuple[int,
     return (cleaned_models, cleaned_files)


-def cleanup_upload(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
-    return cleanup_with_storage_field(context, "storage_path", query)
-
-
-def cleanup_with_storage_field(
-    context: CleanupContext,
-    path_field: str,
-    query: QuerySet,
-) -> tuple[int, int]:
-    cleaned_models = 0
-    cleaned_files = 0
-
-    # delete all those files from storage, using chunks based on the `id` column
-    storage_query = query.order_by("id")
-
-    while True:
-        storage_paths = storage_query.values_list(path_field, flat=True)[
-            :MANUAL_QUERY_CHUNKSIZE
-        ]
-        if len(storage_paths) == 0:
-            break
-
-        cleaned_files += cleanup_files_batched(context, storage_paths)
-        cleaned_models += query.filter(
-            id__in=storage_query[:MANUAL_QUERY_CHUNKSIZE]
-        )._raw_delete(query.db)
-
-    return (cleaned_models, cleaned_files)
+# "v1/repos/{repo_key}/{report_key}/bundle_report.sqlite"


 # All the models that need custom python code for deletions so a bulk `DELETE` query does not work.
@@ -148,9 +162,9 @@ def cleanup_with_storage_field(
     Pull: partial(cleanup_archivefield, "flare"),
     ReportDetails: partial(cleanup_archivefield, "files_array"),
     CommitReport: cleanup_commitreport,
-    ReportSession: cleanup_upload,
+    ReportSession: partial(cleanup_with_storage_field, "storage_path"),
+    CompareCommit: partial(cleanup_with_storage_field, "report_storage_path"),
    # TODO: figure out any other models which have files in storage that are not `ArchiveField`
    # TODO: TA is also storing files in GCS
-    # TODO: BA is also storing files in GCS
-    # TODO: There is also `CompareCommit.report_storage_path`, but that does not seem to be implemented as Django model?
+    # TODO: profiling, label analysis and static analysis also needs porting to django
 }
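For orientation, a rough sketch of how a cleanup driver might consume this model-to-handler mapping; the `MANUAL_CLEANUP` name and the `run_cleanup` helper are assumptions for illustration, not code shown in this diff:

def run_cleanup(context: CleanupContext, query: QuerySet) -> tuple[int, int]:
    # assumed mapping name; each entry is a callable(context, queryset) returning
    # (deleted_models, deleted_files), matching the partial(...) entries above
    handler = MANUAL_CLEANUP.get(query.model)
    if handler is not None:
        # files in storage have to be removed before the rows can go away
        return handler(context, query)
    # models without attached storage can be bulk-deleted directly
    return (query._raw_delete(query.db), 0)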