@@ -23,7 +23,7 @@ use anyhow::anyhow;
23
23
use arrow_array:: RecordBatch ;
24
24
use arrow_json:: reader:: { infer_json_schema_from_iterator, ReaderBuilder } ;
25
25
use arrow_schema:: { DataType , Field , Fields , Schema } ;
26
- use chrono:: { DateTime , NaiveDateTime , Utc } ;
26
+ use chrono:: { DateTime , NaiveDate , NaiveDateTime , Utc } ;
27
27
use datafusion:: arrow:: util:: bit_util:: round_upto_multiple_of_64;
28
28
use itertools:: Itertools ;
29
29
use serde_json:: Value ;
@@ -62,6 +62,7 @@ impl EventFormat for Event {
62
62
schema : & HashMap < String , Arc < Field > > ,
63
63
time_partition : Option < & String > ,
64
64
schema_version : SchemaVersion ,
65
+ static_schema_flag : bool ,
65
66
) -> Result < ( Self :: Data , Vec < Arc < Field > > , bool ) , anyhow:: Error > {
66
67
let stream_schema = schema;
67
68
@@ -111,7 +112,7 @@ impl EventFormat for Event {
111
112
112
113
if value_arr
113
114
. iter ( )
114
- . any ( |value| fields_mismatch ( & schema, value, schema_version) )
115
+ . any ( |value| fields_mismatch ( & schema, value, schema_version, static_schema_flag ) )
115
116
{
116
117
return Err ( anyhow ! (
117
118
"Could not process this event due to mismatch in datatype"
@@ -253,73 +254,131 @@ fn collect_keys<'a>(values: impl Iterator<Item = &'a Value>) -> Result<Vec<&'a s
253
254
Ok ( keys)
254
255
}
255
256
256
- fn fields_mismatch ( schema : & [ Arc < Field > ] , body : & Value , schema_version : SchemaVersion ) -> bool {
257
+ fn fields_mismatch (
258
+ schema : & [ Arc < Field > ] ,
259
+ body : & Value ,
260
+ schema_version : SchemaVersion ,
261
+ static_schema_flag : bool ,
262
+ ) -> bool {
257
263
for ( name, val) in body. as_object ( ) . expect ( "body is of object variant" ) {
258
264
if val. is_null ( ) {
259
265
continue ;
260
266
}
261
267
let Some ( field) = get_field ( schema, name) else {
262
268
return true ;
263
269
} ;
264
- if !valid_type ( field. data_type ( ) , val, schema_version) {
270
+ if !valid_type ( field, val, schema_version, static_schema_flag ) {
265
271
return true ;
266
272
}
267
273
}
268
274
false
269
275
}
270
276
271
- fn valid_type ( data_type : & DataType , value : & Value , schema_version : SchemaVersion ) -> bool {
272
- match data_type {
277
+ fn valid_type (
278
+ field : & Field ,
279
+ value : & Value ,
280
+ schema_version : SchemaVersion ,
281
+ static_schema_flag : bool ,
282
+ ) -> bool {
283
+ match field. data_type ( ) {
273
284
DataType :: Boolean => value. is_boolean ( ) ,
274
- DataType :: Int8 | DataType :: Int16 | DataType :: Int32 | DataType :: Int64 => value. is_i64 ( ) ,
285
+ DataType :: Int8 | DataType :: Int16 | DataType :: Int32 | DataType :: Int64 => {
286
+ validate_int ( value, static_schema_flag)
287
+ }
275
288
DataType :: UInt8 | DataType :: UInt16 | DataType :: UInt32 | DataType :: UInt64 => value. is_u64 ( ) ,
276
289
DataType :: Float16 | DataType :: Float32 => value. is_f64 ( ) ,
277
- // All numbers can be cast as Float64 from schema version v1
278
- DataType :: Float64 if schema_version == SchemaVersion :: V1 => value. is_number ( ) ,
279
- DataType :: Float64 if schema_version != SchemaVersion :: V1 => value. is_f64 ( ) ,
290
+ DataType :: Float64 => validate_float ( value, schema_version, static_schema_flag) ,
280
291
DataType :: Utf8 => value. is_string ( ) ,
281
- DataType :: List ( field) => {
282
- let data_type = field. data_type ( ) ;
283
- if let Value :: Array ( arr) = value {
284
- for elem in arr {
285
- if elem. is_null ( ) {
286
- continue ;
287
- }
288
- if !valid_type ( data_type, elem, schema_version) {
289
- return false ;
290
- }
291
- }
292
- }
293
- true
294
- }
292
+ DataType :: List ( field) => validate_list ( field, value, schema_version, static_schema_flag) ,
295
293
DataType :: Struct ( fields) => {
296
- if let Value :: Object ( val) = value {
297
- for ( key, value) in val {
298
- let field = ( 0 ..fields. len ( ) )
299
- . find ( |idx| fields[ * idx] . name ( ) == key)
300
- . map ( |idx| & fields[ idx] ) ;
301
-
302
- if let Some ( field) = field {
303
- if value. is_null ( ) {
304
- continue ;
305
- }
306
- if !valid_type ( field. data_type ( ) , value, schema_version) {
307
- return false ;
308
- }
309
- } else {
310
- return false ;
311
- }
312
- }
313
- true
314
- } else {
315
- false
294
+ validate_struct ( fields, value, schema_version, static_schema_flag)
295
+ }
296
+ DataType :: Date32 => {
297
+ if let Value :: String ( s) = value {
298
+ return NaiveDate :: parse_from_str ( s, "%Y-%m-%d" ) . is_ok ( ) ;
316
299
}
300
+ false
317
301
}
318
302
DataType :: Timestamp ( _, _) => value. is_string ( ) || value. is_number ( ) ,
319
303
_ => {
320
- error ! ( "Unsupported datatype {:?}, value {:?}" , data_type, value) ;
321
- unreachable ! ( )
304
+ error ! (
305
+ "Unsupported datatype {:?}, value {:?}" ,
306
+ field. data_type( ) ,
307
+ value
308
+ ) ;
309
+ false
310
+ }
311
+ }
312
+ }
313
+
314
+ fn validate_int ( value : & Value , static_schema_flag : bool ) -> bool {
315
+ // allow casting string to int for static schema
316
+ if static_schema_flag {
317
+ if let Value :: String ( s) = value {
318
+ return s. trim ( ) . parse :: < i64 > ( ) . is_ok ( ) ;
319
+ }
320
+ }
321
+ value. is_i64 ( )
322
+ }
323
+
324
+ fn validate_float ( value : & Value , schema_version : SchemaVersion , static_schema_flag : bool ) -> bool {
325
+ // allow casting string to int for static schema
326
+ if static_schema_flag {
327
+ if let Value :: String ( s) = value. clone ( ) {
328
+ let trimmed = s. trim ( ) ;
329
+ return trimmed. parse :: < f64 > ( ) . is_ok ( ) || trimmed. parse :: < i64 > ( ) . is_ok ( ) ;
330
+ }
331
+ return value. is_number ( ) ;
332
+ }
333
+ match schema_version {
334
+ SchemaVersion :: V1 => value. is_number ( ) ,
335
+ _ => value. is_f64 ( ) ,
336
+ }
337
+ }
338
+
339
+ fn validate_list (
340
+ field : & Field ,
341
+ value : & Value ,
342
+ schema_version : SchemaVersion ,
343
+ static_schema_flag : bool ,
344
+ ) -> bool {
345
+ if let Value :: Array ( arr) = value {
346
+ for elem in arr {
347
+ if elem. is_null ( ) {
348
+ continue ;
349
+ }
350
+ if !valid_type ( field, elem, schema_version, static_schema_flag) {
351
+ return false ;
352
+ }
353
+ }
354
+ }
355
+ true
356
+ }
357
+
358
+ fn validate_struct (
359
+ fields : & Fields ,
360
+ value : & Value ,
361
+ schema_version : SchemaVersion ,
362
+ static_schema_flag : bool ,
363
+ ) -> bool {
364
+ if let Value :: Object ( val) = value {
365
+ for ( key, value) in val {
366
+ let field = fields. iter ( ) . find ( |f| f. name ( ) == key) ;
367
+
368
+ if let Some ( field) = field {
369
+ if value. is_null ( ) {
370
+ continue ;
371
+ }
372
+ if !valid_type ( field, value, schema_version, static_schema_flag) {
373
+ return false ;
374
+ }
375
+ } else {
376
+ return false ;
377
+ }
322
378
}
379
+ true
380
+ } else {
381
+ false
323
382
}
324
383
}
325
384
0 commit comments