36
36
//! - Permanently deletes marked versions from the system database
37
37
//! - Input: Version file, versions to delete, unused S3 files
38
38
//! - Output: Deletion confirmation
39
+ //!
40
+ //!
41
+ //!
42
+ //! ListFilesAtVersionOperator
43
+ //! - input: version file & versions to check
44
+ //! - output: full paths of all S3 files used by specified versions
45
+ //!
39
46
47
+ use std:: collections:: HashSet ;
40
48
use std:: fmt:: { Debug , Formatter } ;
49
+ use std:: str:: FromStr ;
41
50
51
+ use crate :: operators:: fetch_lineage_file:: {
52
+ FetchLineageFileError , FetchLineageFileInput , FetchLineageFileOperator , FetchLineageFileOutput ,
53
+ } ;
42
54
use crate :: types:: CleanupMode ;
43
55
use async_trait:: async_trait;
44
56
use chroma_error:: { ChromaError , ErrorCodes } ;
@@ -83,6 +95,7 @@ use prost::Message;
83
95
84
96
pub struct GarbageCollectorOrchestrator {
85
97
collection_id : CollectionUuid ,
98
+ lineage_file_path : Option < String > ,
86
99
version_file_path : String ,
87
100
absolute_cutoff_time : DateTime < Utc > ,
88
101
sysdb_client : SysDb ,
@@ -114,9 +127,11 @@ pub struct GarbageCollectorResponse {
114
127
115
128
#[ allow( clippy:: too_many_arguments) ]
116
129
impl GarbageCollectorOrchestrator {
130
+ /// Lineage file path must be provided if this collection is part of a fork tree.
117
131
pub fn new (
118
132
collection_id : CollectionUuid ,
119
133
version_file_path : String ,
134
+ lineage_file_path : Option < String > ,
120
135
absolute_cutoff_time : DateTime < Utc > ,
121
136
sysdb_client : SysDb ,
122
137
dispatcher : ComponentHandle < Dispatcher > ,
@@ -126,6 +141,7 @@ impl GarbageCollectorOrchestrator {
126
141
Self {
127
142
collection_id,
128
143
version_file_path,
144
+ lineage_file_path,
129
145
absolute_cutoff_time,
130
146
sysdb_client,
131
147
dispatcher,
@@ -165,6 +181,8 @@ pub enum GarbageCollectorError {
165
181
Aborted ,
166
182
#[ error( "DeleteUnusedFiles error: {0}" ) ]
167
183
DeleteUnusedFiles ( #[ from] DeleteUnusedFilesError ) ,
184
+ #[ error( "FetchLineageFile error: {0}" ) ]
185
+ FetchLineageFile ( #[ from] FetchLineageFileError ) ,
168
186
}
169
187
170
188
impl ChromaError for GarbageCollectorError {
@@ -201,14 +219,15 @@ impl Orchestrator for GarbageCollectorOrchestrator {
201
219
"Creating initial fetch version file task"
202
220
) ;
203
221
204
- vec ! [ wrap(
205
- Box :: new( FetchVersionFileOperator { } ) ,
206
- FetchVersionFileInput {
207
- version_file_path: self . version_file_path. clone( ) ,
208
- storage: self . storage. clone( ) ,
209
- } ,
210
- ctx. receiver( ) ,
211
- ) ]
222
+ // vec![wrap(
223
+ // Box::new(FetchVersionFileOperator {}),
224
+ // FetchVersionFileInput {
225
+ // version_file_path: self.version_file_path.clone(),
226
+ // storage: self.storage.clone(),
227
+ // },
228
+ // ctx.receiver(),
229
+ // )]
230
+ vec ! [ ]
212
231
}
213
232
214
233
fn set_result_channel (
@@ -369,34 +388,76 @@ impl Handler<TaskResult<MarkVersionsAtSysDbOutput, MarkVersionsAtSysDbError>>
369
388
message : TaskResult < MarkVersionsAtSysDbOutput , MarkVersionsAtSysDbError > ,
370
389
ctx : & ComponentContext < GarbageCollectorOrchestrator > ,
371
390
) {
372
- // Stage 3: After marking versions, compute unused files
373
391
let output = match self . ok_or_terminate ( message. into_inner ( ) , ctx) . await {
374
392
Some ( output) => output,
375
393
None => return ,
376
394
} ;
377
395
378
- let compute_task = wrap (
379
- Box :: new ( ComputeUnusedFilesOperator :: new (
380
- self . collection_id . to_string ( ) ,
381
- self . storage . clone ( ) ,
382
- 2 , // min_versions_to_keep
383
- ) ) ,
384
- ComputeUnusedFilesInput {
385
- version_file : output. version_file ,
386
- versions_to_delete : output. versions_to_delete ,
387
- oldest_version_to_keep : output. oldest_version_to_keep ,
388
- } ,
389
- ctx. receiver ( ) ,
390
- ) ;
396
+ if let Some ( lineage_file_path) = self . lineage_file_path . clone ( ) {
397
+ let fetch_lineage_file_task = wrap (
398
+ Box :: new ( FetchLineageFileOperator :: new ( ) ) ,
399
+ FetchLineageFileInput :: new ( self . storage . clone ( ) , lineage_file_path) ,
400
+ ctx. receiver ( ) ,
401
+ ) ;
391
402
392
- if let Err ( e) = self
393
- . dispatcher ( )
394
- . send ( compute_task, Some ( Span :: current ( ) ) )
395
- . await
396
- {
397
- self . terminate_with_result ( Err ( GarbageCollectorError :: Channel ( e) ) , ctx)
398
- . await ;
399
- return ;
403
+ if let Err ( e) = self
404
+ . dispatcher ( )
405
+ . send ( fetch_lineage_file_task, Some ( Span :: current ( ) ) )
406
+ . await
407
+ {
408
+ self . terminate_with_result ( Err ( GarbageCollectorError :: Channel ( e) ) , ctx)
409
+ . await ;
410
+ return ;
411
+ }
412
+ }
413
+ }
414
+ }
415
+
416
+ #[ async_trait]
417
+ impl Handler < TaskResult < FetchLineageFileOutput , FetchLineageFileError > >
418
+ for GarbageCollectorOrchestrator
419
+ {
420
+ type Result = ( ) ;
421
+
422
+ async fn handle (
423
+ & mut self ,
424
+ message : TaskResult < FetchLineageFileOutput , FetchLineageFileError > ,
425
+ ctx : & ComponentContext < GarbageCollectorOrchestrator > ,
426
+ ) {
427
+ let output = match self . ok_or_terminate ( message. into_inner ( ) , ctx) . await {
428
+ Some ( output) => output,
429
+ None => return ,
430
+ } ;
431
+
432
+ let mut collection_ids_in_tree = HashSet :: new ( ) ;
433
+ for dependency in output. 0 . dependencies {
434
+ // todo: no expect
435
+ let source_id = CollectionUuid :: from_str ( & dependency. source_collection_id )
436
+ . expect ( "Failed to parse source ID" ) ;
437
+ let target_id = CollectionUuid :: from_str ( & dependency. target_collection_id )
438
+ . expect ( "Failed to parse target ID" ) ;
439
+ collection_ids_in_tree. insert ( source_id) ;
440
+ collection_ids_in_tree. insert ( target_id) ;
441
+ }
442
+
443
+ // todo: need to remove self?
444
+
445
+ for collection_id in collection_ids_in_tree {
446
+ let fetch_version_file_task = wrap (
447
+ Box :: new ( FetchVersionFileOperator :: new ( ) ) ,
448
+ FetchVersionFileInput :: new ( collection_id, self . storage . clone ( ) ) ,
449
+ ctx. receiver ( ) ,
450
+ ) ;
451
+
452
+ if let Err ( e) = self
453
+ . dispatcher ( )
454
+ . send ( fetch_version_file_task, Some ( Span :: current ( ) ) )
455
+ . await
456
+ {
457
+ self . terminate_with_result ( Err ( GarbageCollectorError :: Channel ( e) ) , ctx)
458
+ . await ;
459
+ return ;
460
+ }
400
461
}
401
462
}
402
463
}
@@ -871,6 +932,7 @@ mod tests {
871
932
let orchestrator = GarbageCollectorOrchestrator :: new (
872
933
collection_id,
873
934
collection_info. version_file_path . clone ( ) ,
935
+ None , // todo
874
936
SystemTime :: now ( ) . into ( ) , // immediately expire versions
875
937
sysdb,
876
938
dispatcher_handle,
@@ -1027,6 +1089,7 @@ mod tests {
1027
1089
let orchestrator = GarbageCollectorOrchestrator :: new (
1028
1090
collection_id,
1029
1091
collection_info. version_file_path . clone ( ) ,
1092
+ None , // todo
1030
1093
SystemTime :: now ( ) . into ( ) , // immediately expire versions
1031
1094
sysdb,
1032
1095
dispatcher_handle,
@@ -1185,6 +1248,7 @@ mod tests {
1185
1248
let orchestrator = GarbageCollectorOrchestrator :: new (
1186
1249
collection_id,
1187
1250
collection_info. version_file_path . clone ( ) ,
1251
+ None , // todo
1188
1252
SystemTime :: now ( ) . into ( ) , // immediately expire versions
1189
1253
sysdb,
1190
1254
dispatcher_handle,
0 commit comments