@@ -28,9 +28,8 @@ use std::{
28
28
} ;
29
29
30
30
use arrow_array:: RecordBatch ;
31
- use arrow_ipc:: writer:: StreamWriter ;
32
31
use arrow_schema:: { Field , Fields , Schema } ;
33
- use chrono:: { NaiveDateTime , Timelike } ;
32
+ use chrono:: { NaiveDateTime , Timelike , Utc } ;
34
33
use derive_more:: { Deref , DerefMut } ;
35
34
use itertools:: Itertools ;
36
35
use parquet:: {
@@ -55,14 +54,14 @@ use crate::{
55
54
metrics,
56
55
option:: Mode ,
57
56
storage:: { object_storage:: to_bytes, retention:: Retention , StreamType } ,
58
- utils:: time:: Minute ,
57
+ utils:: time:: { Minute , TimeRange } ,
59
58
LOCK_EXPECT , OBJECT_STORE_DATA_GRANULARITY ,
60
59
} ;
61
60
62
61
use super :: {
63
62
staging:: {
64
63
reader:: { MergedRecordReader , MergedReverseRecordReader } ,
65
- writer:: Writer ,
64
+ writer:: { DiskWriter , Writer } ,
66
65
StagingError ,
67
66
} ,
68
67
LogStream , ARROW_FILE_EXTENSION ,
@@ -123,29 +122,26 @@ impl Stream {
123
122
) -> Result < ( ) , StagingError > {
124
123
let mut guard = self . writer . lock ( ) . unwrap ( ) ;
125
124
if self . options . mode != Mode :: Query || stream_type == StreamType :: Internal {
126
- match guard. disk . get_mut ( schema_key) {
125
+ let filename =
126
+ self . filename_by_partition ( schema_key, parsed_timestamp, custom_partition_values) ;
127
+ match guard. disk . get_mut ( & filename) {
127
128
Some ( writer) => {
128
129
writer. write ( record) ?;
129
130
}
130
131
None => {
131
132
// entry is not present thus we create it
132
- let file_path = self . path_by_current_time (
133
- schema_key,
134
- parsed_timestamp,
135
- custom_partition_values,
136
- ) ;
137
133
std:: fs:: create_dir_all ( & self . data_path ) ?;
138
134
139
- let file = OpenOptions :: new ( )
140
- . create ( true )
141
- . append ( true )
142
- . open ( & file_path ) ? ;
143
-
144
- let mut writer = StreamWriter :: try_new ( file , & record. schema ( ) )
135
+ let range = TimeRange :: granularity_range (
136
+ parsed_timestamp . and_local_timezone ( Utc ) . unwrap ( ) ,
137
+ OBJECT_STORE_DATA_GRANULARITY ,
138
+ ) ;
139
+ let file_path = self . data_path . join ( & filename ) ;
140
+ let mut writer = DiskWriter :: try_new ( file_path , & record. schema ( ) , range )
145
141
. expect ( "File and RecordBatch both are checked" ) ;
146
142
147
143
writer. write ( record) ?;
148
- guard. disk . insert ( schema_key . to_owned ( ) , writer) ;
144
+ guard. disk . insert ( filename , writer) ;
149
145
}
150
146
} ;
151
147
}
@@ -155,17 +151,17 @@ impl Stream {
155
151
Ok ( ( ) )
156
152
}
157
153
158
- pub fn path_by_current_time (
154
+ pub fn filename_by_partition (
159
155
& self ,
160
156
stream_hash : & str ,
161
157
parsed_timestamp : NaiveDateTime ,
162
158
custom_partition_values : & HashMap < String , String > ,
163
- ) -> PathBuf {
159
+ ) -> String {
164
160
let mut hostname = hostname:: get ( ) . unwrap ( ) . into_string ( ) . unwrap ( ) ;
165
161
if let Some ( id) = & self . ingestor_id {
166
162
hostname. push_str ( id) ;
167
163
}
168
- let filename = format ! (
164
+ format ! (
169
165
"{stream_hash}.date={}.hour={:02}.minute={}.{}{hostname}.data.{ARROW_FILE_EXTENSION}" ,
170
166
parsed_timestamp. date( ) ,
171
167
parsed_timestamp. hour( ) ,
@@ -175,8 +171,7 @@ impl Stream {
175
171
. sorted_by_key( |v| v. 0 )
176
172
. map( |( key, value) | format!( "{key}={value}." ) )
177
173
. join( "" )
178
- ) ;
179
- self . data_path . join ( filename)
174
+ )
180
175
}
181
176
182
177
pub fn arrow_files ( & self ) -> Vec < PathBuf > {
@@ -366,19 +361,12 @@ impl Stream {
366
361
self . writer . lock ( ) . unwrap ( ) . mem . clear ( ) ;
367
362
}
368
363
369
- pub fn flush ( & self ) {
370
- let mut disk_writers = {
371
- let mut writer = self . writer . lock ( ) . unwrap ( ) ;
372
- // Flush memory
373
- writer. mem . clear ( ) ;
374
- // Take schema -> disk writer mapping
375
- std:: mem:: take ( & mut writer. disk )
376
- } ;
377
-
378
- // Flush disk
379
- for writer in disk_writers. values_mut ( ) {
380
- _ = writer. finish ( ) ;
381
- }
364
+ pub fn flush ( & self , forced : bool ) {
365
+ let mut writer = self . writer . lock ( ) . unwrap ( ) ;
366
+ // Flush memory
367
+ writer. mem . clear ( ) ;
368
+ // Drop schema -> disk writer mapping, triggers flush to disk
369
+ writer. disk . retain ( |_, w| !forced && w. is_current ( ) ) ;
382
370
}
383
371
384
372
fn parquet_writer_props (
@@ -733,7 +721,7 @@ impl Stream {
733
721
734
722
/// First flushes arrows onto disk and then converts the arrow into parquet
735
723
pub fn flush_and_convert ( & self , shutdown_signal : bool ) -> Result < ( ) , StagingError > {
736
- self . flush ( ) ;
724
+ self . flush ( shutdown_signal ) ;
737
725
738
726
self . prepare_parquet ( shutdown_signal)
739
727
}
@@ -944,18 +932,18 @@ mod tests {
944
932
None ,
945
933
) ;
946
934
947
- let expected_path = staging . data_path . join ( format ! (
935
+ let expected = format ! (
948
936
"{stream_hash}.date={}.hour={:02}.minute={}.{}.data.{ARROW_FILE_EXTENSION}" ,
949
937
parsed_timestamp. date( ) ,
950
938
parsed_timestamp. hour( ) ,
951
939
Minute :: from( parsed_timestamp) . to_slot( OBJECT_STORE_DATA_GRANULARITY ) ,
952
940
hostname:: get( ) . unwrap( ) . into_string( ) . unwrap( )
953
- ) ) ;
941
+ ) ;
954
942
955
- let generated_path =
956
- staging. path_by_current_time ( stream_hash, parsed_timestamp, & custom_partition_values) ;
943
+ let generated =
944
+ staging. filename_by_partition ( stream_hash, parsed_timestamp, & custom_partition_values) ;
957
945
958
- assert_eq ! ( generated_path , expected_path ) ;
946
+ assert_eq ! ( generated , expected ) ;
959
947
}
960
948
961
949
#[ test]
@@ -978,18 +966,18 @@ mod tests {
978
966
None ,
979
967
) ;
980
968
981
- let expected_path = staging . data_path . join ( format ! (
969
+ let expected = format ! (
982
970
"{stream_hash}.date={}.hour={:02}.minute={}.key1=value1.key2=value2.{}.data.{ARROW_FILE_EXTENSION}" ,
983
971
parsed_timestamp. date( ) ,
984
972
parsed_timestamp. hour( ) ,
985
973
Minute :: from( parsed_timestamp) . to_slot( OBJECT_STORE_DATA_GRANULARITY ) ,
986
974
hostname:: get( ) . unwrap( ) . into_string( ) . unwrap( )
987
- ) ) ;
975
+ ) ;
988
976
989
- let generated_path =
990
- staging. path_by_current_time ( stream_hash, parsed_timestamp, & custom_partition_values) ;
977
+ let generated =
978
+ staging. filename_by_partition ( stream_hash, parsed_timestamp, & custom_partition_values) ;
991
979
992
- assert_eq ! ( generated_path , expected_path ) ;
980
+ assert_eq ! ( generated , expected ) ;
993
981
}
994
982
995
983
#[ test]
@@ -1045,7 +1033,7 @@ mod tests {
1045
1033
StreamType :: UserDefined ,
1046
1034
)
1047
1035
. unwrap ( ) ;
1048
- staging. flush ( ) ;
1036
+ staging. flush ( true ) ;
1049
1037
}
1050
1038
1051
1039
#[ test]
0 commit comments