@@ -2,7 +2,7 @@ use crate::{
2
2
config:: FrontendConfig , executor:: Executor , types:: errors:: ValidationError ,
3
3
CollectionsWithSegmentsProvider ,
4
4
} ;
5
- use backon:: Retryable ;
5
+ use backon:: { ExponentialBuilder , Retryable } ;
6
6
use chroma_config:: { registry, Configurable } ;
7
7
use chroma_error:: { ChromaError , ErrorCodes } ;
8
8
use chroma_log:: { LocalCompactionManager , LocalCompactionManagerConfig , Log } ;
@@ -35,10 +35,10 @@ use chroma_types::{
35
35
} ;
36
36
use opentelemetry:: global;
37
37
use opentelemetry:: metrics:: Counter ;
38
- use std:: collections:: HashSet ;
39
38
use std:: sync:: atomic:: { AtomicUsize , Ordering } ;
40
39
use std:: sync:: Arc ;
41
40
use std:: time:: { SystemTime , UNIX_EPOCH } ;
41
+ use std:: { collections:: HashSet , time:: Duration } ;
42
42
43
43
use super :: utils:: to_records;
44
44
@@ -49,6 +49,9 @@ struct Metrics {
49
49
count_retries_counter : Counter < u64 > ,
50
50
query_retries_counter : Counter < u64 > ,
51
51
get_retries_counter : Counter < u64 > ,
52
+ add_retries_counter : Counter < u64 > ,
53
+ update_retries_counter : Counter < u64 > ,
54
+ upsert_retries_counter : Counter < u64 > ,
52
55
}
53
56
54
57
#[ derive( Clone , Debug ) ]
@@ -61,6 +64,7 @@ pub struct ServiceBasedFrontend {
61
64
max_batch_size : u32 ,
62
65
metrics : Arc < Metrics > ,
63
66
default_knn_index : KnnIndex ,
67
+ retries_builder : ExponentialBuilder ,
64
68
}
65
69
66
70
impl ServiceBasedFrontend {
@@ -78,14 +82,32 @@ impl ServiceBasedFrontend {
78
82
let delete_retries_counter = meter. u64_counter ( "delete_retries" ) . build ( ) ;
79
83
let count_retries_counter = meter. u64_counter ( "count_retries" ) . build ( ) ;
80
84
let query_retries_counter = meter. u64_counter ( "query_retries" ) . build ( ) ;
81
- let get_retries_counter = meter. u64_counter ( "query_retries" ) . build ( ) ;
85
+ let get_retries_counter = meter. u64_counter ( "get_retries" ) . build ( ) ;
86
+ let add_retries_counter = meter. u64_counter ( "add_retries" ) . build ( ) ;
87
+ let update_retries_counter = meter. u64_counter ( "update_retries" ) . build ( ) ;
88
+ let upsert_retries_counter = meter. u64_counter ( "upsert_retries" ) . build ( ) ;
82
89
let metrics = Arc :: new ( Metrics {
83
90
fork_retries_counter,
84
91
delete_retries_counter,
85
92
count_retries_counter,
86
93
query_retries_counter,
87
94
get_retries_counter,
95
+ add_retries_counter,
96
+ update_retries_counter,
97
+ upsert_retries_counter,
88
98
} ) ;
99
+ // factor: 2.0,
100
+ // min_delay_ms: 100,
101
+ // max_delay_ms: 5000,
102
+ // max_attempts: 5,
103
+ // jitter: true,
104
+ // TODO(Sanket): Ideally these retry settings should be read from config instead of being hard-coded.
105
+ let retries_builder = ExponentialBuilder :: default ( )
106
+ . with_max_times ( 5 )
107
+ . with_factor ( 2.0 )
108
+ . with_max_delay ( Duration :: from_millis ( 5000 ) )
109
+ . with_min_delay ( Duration :: from_millis ( 100 ) )
110
+ . with_jitter ( ) ;
89
111
ServiceBasedFrontend {
90
112
allow_reset,
91
113
executor,
@@ -95,6 +117,7 @@ impl ServiceBasedFrontend {
95
117
max_batch_size,
96
118
metrics,
97
119
default_knn_index,
120
+ retries_builder,
98
121
}
99
122
}
100
123
@@ -630,6 +653,14 @@ impl ServiceBasedFrontend {
630
653
res
631
654
}
632
655
656
/// Pushes `records` to the log for `collection_id` by delegating to
/// `self.log_client.push_logs`, returning that call's result unchanged.
///
/// This is the unit of work that the retry closures in `add`, `update` and
/// `upsert` invoke on a clone of the frontend; it performs no retrying or
/// error mapping of its own.
///
/// `records` is taken by value, so a caller that may retry must pass a fresh
/// clone on each attempt.
///
/// # Errors
///
/// Returns whatever error `push_logs` produces, as a `Box<dyn ChromaError>`.
+ pub async fn retryable_push_logs (
657
+ & mut self ,
658
+ collection_id : CollectionUuid ,
659
+ records : Vec < OperationRecord > ,
660
+ ) -> Result < ( ) , Box < dyn ChromaError > > {
661
+ self . log_client . push_logs ( collection_id, records) . await
662
+ }
663
+
633
664
pub async fn add (
634
665
& mut self ,
635
666
AddCollectionRecordsRequest {
@@ -656,16 +687,29 @@ impl ServiceBasedFrontend {
656
687
to_records ( ids, embeddings, documents, uris, metadatas, Operation :: Add )
657
688
. map_err ( |err| Box :: new ( err) as Box < dyn ChromaError > ) ?;
658
689
659
- self . log_client
660
- . push_logs ( collection_id, records)
661
- . await
662
- . map_err ( |err| {
663
- if err. code ( ) == ErrorCodes :: Unavailable {
664
- AddCollectionRecordsError :: Backoff
665
- } else {
666
- AddCollectionRecordsError :: Other ( Box :: new ( err) as _ )
690
+ let retries = Arc :: new ( AtomicUsize :: new ( 0 ) ) ;
691
+ let add_to_retry = || {
692
+ let mut self_clone = self . clone ( ) ;
693
+ let records_clone = records. clone ( ) ;
694
+ async move {
695
+ self_clone
696
+ . retryable_push_logs ( collection_id, records_clone)
697
+ . await
698
+ }
699
+ } ;
700
+ let res = add_to_retry
701
+ . retry ( self . retries_builder )
702
+ . when ( |e| matches ! ( e. code( ) , ErrorCodes :: AlreadyExists ) )
703
+ . notify ( |_, _| {
704
+ let retried = retries. fetch_add ( 1 , Ordering :: Relaxed ) ;
705
+ if retried > 0 {
706
+ tracing:: info!( "Retrying add() request for collection {}" , collection_id) ;
667
707
}
668
- } ) ?;
708
+ } )
709
+ . await ;
710
+ self . metrics
711
+ . add_retries_counter
712
+ . add ( retries. load ( Ordering :: Relaxed ) as u64 , & [ ] ) ;
669
713
670
714
// TODO: Submit event after the response is sent
671
715
MeterEvent :: CollectionWrite {
@@ -678,7 +722,16 @@ impl ServiceBasedFrontend {
678
722
. submit ( )
679
723
. await ;
680
724
681
- Ok ( AddCollectionRecordsResponse { } )
725
+ match res {
726
+ Ok ( ( ) ) => Ok ( AddCollectionRecordsResponse { } ) ,
727
+ Err ( e) => {
728
+ if e. code ( ) == ErrorCodes :: AlreadyExists {
729
+ Err ( AddCollectionRecordsError :: Backoff )
730
+ } else {
731
+ Err ( AddCollectionRecordsError :: Other ( Box :: new ( e) as _ ) )
732
+ }
733
+ }
734
+ }
682
735
}
683
736
684
737
pub async fn update (
@@ -711,16 +764,29 @@ impl ServiceBasedFrontend {
711
764
)
712
765
. map_err ( |err| Box :: new ( err) as Box < dyn ChromaError > ) ?;
713
766
714
- self . log_client
715
- . push_logs ( collection_id, records)
716
- . await
717
- . map_err ( |err| {
718
- if err. code ( ) == ErrorCodes :: Unavailable {
719
- UpdateCollectionRecordsError :: Backoff
720
- } else {
721
- UpdateCollectionRecordsError :: Other ( Box :: new ( err) as _ )
767
+ let retries = Arc :: new ( AtomicUsize :: new ( 0 ) ) ;
768
+ let add_to_retry = || {
769
+ let mut self_clone = self . clone ( ) ;
770
+ let records_clone = records. clone ( ) ;
771
+ async move {
772
+ self_clone
773
+ . retryable_push_logs ( collection_id, records_clone)
774
+ . await
775
+ }
776
+ } ;
777
+ let res = add_to_retry
778
+ . retry ( self . retries_builder )
779
+ . when ( |e| matches ! ( e. code( ) , ErrorCodes :: AlreadyExists ) )
780
+ . notify ( |_, _| {
781
+ let retried = retries. fetch_add ( 1 , Ordering :: Relaxed ) ;
782
+ if retried > 0 {
783
+ tracing:: info!( "Retrying update() request for collection {}" , collection_id) ;
722
784
}
723
- } ) ?;
785
+ } )
786
+ . await ;
787
+ self . metrics
788
+ . update_retries_counter
789
+ . add ( retries. load ( Ordering :: Relaxed ) as u64 , & [ ] ) ;
724
790
725
791
// TODO: Submit event after the response is sent
726
792
MeterEvent :: CollectionWrite {
@@ -733,7 +799,16 @@ impl ServiceBasedFrontend {
733
799
. submit ( )
734
800
. await ;
735
801
736
- Ok ( UpdateCollectionRecordsResponse { } )
802
+ match res {
803
+ Ok ( ( ) ) => Ok ( UpdateCollectionRecordsResponse { } ) ,
804
+ Err ( e) => {
805
+ if e. code ( ) == ErrorCodes :: AlreadyExists {
806
+ Err ( UpdateCollectionRecordsError :: Backoff )
807
+ } else {
808
+ Err ( UpdateCollectionRecordsError :: Other ( Box :: new ( e) as _ ) )
809
+ }
810
+ }
811
+ }
737
812
}
738
813
739
814
pub async fn upsert (
@@ -768,16 +843,29 @@ impl ServiceBasedFrontend {
768
843
)
769
844
. map_err ( |err| Box :: new ( err) as Box < dyn ChromaError > ) ?;
770
845
771
- self . log_client
772
- . push_logs ( collection_id, records)
773
- . await
774
- . map_err ( |err| {
775
- if err. code ( ) == ErrorCodes :: Unavailable {
776
- UpsertCollectionRecordsError :: Backoff
777
- } else {
778
- UpsertCollectionRecordsError :: Other ( Box :: new ( err) as _ )
846
+ let retries = Arc :: new ( AtomicUsize :: new ( 0 ) ) ;
847
+ let add_to_retry = || {
848
+ let mut self_clone = self . clone ( ) ;
849
+ let records_clone = records. clone ( ) ;
850
+ async move {
851
+ self_clone
852
+ . retryable_push_logs ( collection_id, records_clone)
853
+ . await
854
+ }
855
+ } ;
856
+ let res = add_to_retry
857
+ . retry ( self . retries_builder )
858
+ . when ( |e| matches ! ( e. code( ) , ErrorCodes :: AlreadyExists ) )
859
+ . notify ( |_, _| {
860
+ let retried = retries. fetch_add ( 1 , Ordering :: Relaxed ) ;
861
+ if retried > 0 {
862
+ tracing:: info!( "Retrying upsert() request for collection {}" , collection_id) ;
779
863
}
780
- } ) ?;
864
+ } )
865
+ . await ;
866
+ self . metrics
867
+ . upsert_retries_counter
868
+ . add ( retries. load ( Ordering :: Relaxed ) as u64 , & [ ] ) ;
781
869
782
870
// TODO: Submit event after the response is sent
783
871
MeterEvent :: CollectionWrite {
@@ -790,7 +878,16 @@ impl ServiceBasedFrontend {
790
878
. submit ( )
791
879
. await ;
792
880
793
- Ok ( UpsertCollectionRecordsResponse { } )
881
+ match res {
882
+ Ok ( ( ) ) => Ok ( UpsertCollectionRecordsResponse { } ) ,
883
+ Err ( e) => {
884
+ if e. code ( ) == ErrorCodes :: AlreadyExists {
885
+ Err ( UpsertCollectionRecordsError :: Backoff )
886
+ } else {
887
+ Err ( UpsertCollectionRecordsError :: Other ( Box :: new ( e) as _ ) )
888
+ }
889
+ }
890
+ }
794
891
}
795
892
796
893
pub async fn retryable_delete (
0 commit comments