@@ -23,22 +23,38 @@ use anyhow::anyhow;
23
23
use arrow_array:: RecordBatch ;
24
24
use arrow_json:: reader:: { infer_json_schema_from_iterator, ReaderBuilder } ;
25
25
use arrow_schema:: { DataType , Field , Fields , Schema } ;
26
+ use chrono:: { DateTime , NaiveDateTime , Utc } ;
26
27
use datafusion:: arrow:: util:: bit_util:: round_upto_multiple_of_64;
27
28
use itertools:: Itertools ;
28
29
use serde_json:: Value ;
29
30
use std:: { collections:: HashMap , sync:: Arc } ;
30
31
use tracing:: error;
31
32
32
33
use super :: EventFormat ;
33
- use crate :: { metadata:: SchemaVersion , utils:: arrow:: get_field} ;
34
+ use crate :: { metadata:: SchemaVersion , storage :: StreamType , utils:: arrow:: get_field} ;
34
35
35
36
pub struct Event {
36
- pub data : Value ,
37
+ pub json : Value ,
38
+ pub p_timestamp : DateTime < Utc > ,
39
+ }
40
+
41
+ impl Event {
42
+ pub fn new ( json : Value ) -> Self {
43
+ Self {
44
+ json,
45
+ p_timestamp : Utc :: now ( ) ,
46
+ }
47
+ }
37
48
}
38
49
39
50
impl EventFormat for Event {
40
51
type Data = Vec < Value > ;
41
52
53
+ /// Returns the time at ingestion, i.e. the `p_timestamp` value
54
+ fn get_p_timestamp ( & self ) -> DateTime < Utc > {
55
+ self . p_timestamp
56
+ }
57
+
42
58
// convert the incoming json to a vector of json values
43
59
// also extract the arrow schema, tags and metadata from the incoming json
44
60
fn to_data (
@@ -52,7 +68,7 @@ impl EventFormat for Event {
52
68
// incoming event may be a single json or a json array
53
69
// but Data (type defined above) is a vector of json values
54
70
// hence we need to convert the incoming event to a vector of json values
55
- let value_arr = match self . data {
71
+ let value_arr = match self . json {
56
72
Value :: Array ( arr) => arr,
57
73
value @ Value :: Object ( _) => vec ! [ value] ,
58
74
_ => unreachable ! ( "flatten would have failed beforehand" ) ,
@@ -120,6 +136,87 @@ impl EventFormat for Event {
120
136
Ok ( None ) => unreachable ! ( "all records are added to one rb" ) ,
121
137
}
122
138
}
139
+
140
+ /// Converts a JSON event into a Parseable Event
141
+ fn into_event (
142
+ self ,
143
+ stream_name : String ,
144
+ origin_size : u64 ,
145
+ storage_schema : & HashMap < String , Arc < Field > > ,
146
+ static_schema_flag : bool ,
147
+ custom_partitions : Option < & String > ,
148
+ time_partition : Option < & String > ,
149
+ schema_version : SchemaVersion ,
150
+ stream_type : StreamType ,
151
+ ) -> Result < super :: Event , anyhow:: Error > {
152
+ let custom_partition_values = match custom_partitions. as_ref ( ) {
153
+ Some ( custom_partition) => {
154
+ let custom_partitions = custom_partition. split ( ',' ) . collect_vec ( ) ;
155
+ extract_custom_partition_values ( & self . json , & custom_partitions)
156
+ }
157
+ None => HashMap :: new ( ) ,
158
+ } ;
159
+
160
+ let parsed_timestamp = match time_partition {
161
+ Some ( time_partition) => extract_and_parse_time ( & self . json , time_partition) ?,
162
+ _ => self . p_timestamp . naive_utc ( ) ,
163
+ } ;
164
+
165
+ let ( rb, is_first_event) = self . into_recordbatch (
166
+ storage_schema,
167
+ static_schema_flag,
168
+ time_partition,
169
+ schema_version,
170
+ ) ?;
171
+
172
+ Ok ( super :: Event {
173
+ rb,
174
+ stream_name,
175
+ origin_format : "json" ,
176
+ origin_size,
177
+ is_first_event,
178
+ parsed_timestamp,
179
+ time_partition : None ,
180
+ custom_partition_values,
181
+ stream_type,
182
+ } )
183
+ }
184
+ }
185
+
186
+ /// Extracts custom partition values from provided JSON object
187
+ /// e.g. `json: {"status": 400, "msg": "Hello, World!"}, custom_partition_list: ["status"]` returns `{"status" => 400}`
188
+ pub fn extract_custom_partition_values (
189
+ json : & Value ,
190
+ custom_partition_list : & [ & str ] ,
191
+ ) -> HashMap < String , String > {
192
+ let mut custom_partition_values: HashMap < String , String > = HashMap :: new ( ) ;
193
+ for custom_partition_field in custom_partition_list {
194
+ let custom_partition_value = json. get ( custom_partition_field. trim ( ) ) . unwrap ( ) . to_owned ( ) ;
195
+ let custom_partition_value = match custom_partition_value {
196
+ e @ Value :: Number ( _) | e @ Value :: Bool ( _) => e. to_string ( ) ,
197
+ Value :: String ( s) => s,
198
+ _ => "" . to_string ( ) ,
199
+ } ;
200
+ custom_partition_values. insert (
201
+ custom_partition_field. trim ( ) . to_string ( ) ,
202
+ custom_partition_value,
203
+ ) ;
204
+ }
205
+ custom_partition_values
206
+ }
207
+
208
+ /// Returns the parsed timestamp of deignated time partition from json object
209
+ /// e.g. `json: {"timestamp": "2025-05-15T15:30:00Z"}` returns `2025-05-15T15:30:00`
210
+ fn extract_and_parse_time (
211
+ json : & Value ,
212
+ time_partition : & str ,
213
+ ) -> Result < NaiveDateTime , anyhow:: Error > {
214
+ let current_time = json
215
+ . get ( time_partition)
216
+ . ok_or_else ( || anyhow ! ( "Missing field for time partition in json: {time_partition}" ) ) ?;
217
+ let parsed_time: DateTime < Utc > = serde_json:: from_value ( current_time. clone ( ) ) ?;
218
+
219
+ Ok ( parsed_time. naive_utc ( ) )
123
220
}
124
221
125
222
// Returns arrow schema with the fields that are present in the request body
@@ -225,3 +322,37 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion
225
322
}
226
323
}
227
324
}
325
+
326
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use serde_json::json;

    use super::*;

    /// A well-formed RFC 3339 value in the time-partition field is parsed and
    /// returned as a naive UTC timestamp.
    #[test]
    fn parse_time_partition_from_value() {
        let json = json!({"timestamp": "2025-05-15T15:30:00Z"});
        let parsed = extract_and_parse_time(&json, "timestamp");

        let expected = NaiveDateTime::from_str("2025-05-15T15:30:00").unwrap();
        assert_eq!(parsed.unwrap(), expected);
    }

    /// A payload that lacks the time-partition field is rejected.
    #[test]
    fn time_partition_not_in_json() {
        let json = json!({"hello": "world!"});
        let parsed = extract_and_parse_time(&json, "timestamp");

        assert!(parsed.is_err());
    }

    /// A time-partition value that is not a datetime is rejected.
    #[test]
    fn time_partition_not_parseable_as_datetime() {
        let json = json!({"timestamp": "not time"});
        let parsed = extract_and_parse_time(&json, "timestamp");

        assert!(parsed.is_err());
    }

    /// Scalars are stringified, non-scalars become empty strings, and field
    /// names are trimmed before lookup and in the resulting keys.
    #[test]
    fn custom_partition_values_are_stringified() {
        let json = json!({
            "status": 400,
            "ok": true,
            "msg": "Hello, World!",
            "nested": {"a": 1}
        });
        let values = extract_custom_partition_values(&json, &[" status", "ok ", "msg", "nested"]);

        assert_eq!(values.len(), 4);
        assert_eq!(values["status"], "400");
        assert_eq!(values["ok"], "true");
        assert_eq!(values["msg"], "Hello, World!");
        assert_eq!(values["nested"], "");
    }
}
0 commit comments