@@ -37,6 +37,45 @@ pub enum TimestampPrecision {
3737 Nanosecond ,
3838}
3939
40+ /// Builder for configuring Arrow schema conversion options.
41+ #[ derive( Debug , Clone ) ]
42+ pub struct ArrowSchemaOptions {
43+ timestamp_precision : TimestampPrecision ,
44+ }
45+
46+ impl Default for ArrowSchemaOptions {
47+ fn default ( ) -> Self {
48+ Self :: new ( )
49+ }
50+ }
51+
52+ impl ArrowSchemaOptions {
53+ /// Create a new options builder with default values.
54+ /// - Timestamp precision is [`TimestampPrecision::Nanosecond`]
55+ pub fn new ( ) -> Self {
56+ Self {
57+ timestamp_precision : TimestampPrecision :: default ( ) ,
58+ }
59+ }
60+
61+ /// Set the timestamp precision for converting ORC timestamps to Arrow.
62+ ///
63+ /// ORC timestamps have nanosecond precision, but you may want to convert
64+ /// them to microseconds for compatibility with systems that don't support
65+ /// nanosecond precision.
66+ ///
67+ /// Default: [`TimestampPrecision::Nanosecond`]
68+ pub fn with_timestamp_precision ( mut self , precision : TimestampPrecision ) -> Self {
69+ self . timestamp_precision = precision;
70+ self
71+ }
72+
73+ /// Get the timestamp precision
74+ fn timestamp_precision ( & self ) -> TimestampPrecision {
75+ self . timestamp_precision
76+ }
77+ }
78+
4079/// Represents the root data type of the ORC file. Contains multiple named child types
4180/// which map to the columns available. Allows projecting only specific columns from
4281/// the base schema.
@@ -73,22 +112,22 @@ impl RootDataType {
73112
74113 /// Convert into an Arrow schema using the default [`ArrowSchemaOptions`]; `user_metadata` is forwarded unchanged to `create_arrow_schema_with_options`.
75114 pub fn create_arrow_schema ( & self , user_metadata : & HashMap < String , String > ) -> Schema {
76- self . create_arrow_schema_with_options ( user_metadata, TimestampPrecision :: default ( ) )
115+ self . create_arrow_schema_with_options ( user_metadata, ArrowSchemaOptions :: new ( ) )
77116 }
78117
79- /// Convert into an Arrow schema with specified timestamp precision .
118+ /// Convert into an Arrow schema with custom options .
80119 pub fn create_arrow_schema_with_options (
81120 & self ,
82121 user_metadata : & HashMap < String , String > ,
83- timestamp_precision : TimestampPrecision ,
122+ options : ArrowSchemaOptions ,
84123 ) -> Schema {
85124 let fields = self
86125 . children
87126 . iter ( )
88127 . map ( |col| {
89128 let dt = col
90129 . data_type ( )
91- . to_arrow_data_type_with_options ( timestamp_precision ) ;
130+ . to_arrow_data_type_with_options ( options . clone ( ) ) ;
92131 Field :: new ( col. name ( ) , dt, true )
93132 } )
94133 . collect :: < Vec < _ > > ( ) ;
@@ -455,14 +494,14 @@ impl DataType {
455494 Ok ( dt)
456495 }
457496
497+ /// Convert this ORC data type to an Arrow data type with default options (delegates to `to_arrow_data_type_with_options` with [`ArrowSchemaOptions::new`]).
458498 pub fn to_arrow_data_type ( & self ) -> ArrowDataType {
459- self . to_arrow_data_type_with_options ( TimestampPrecision :: default ( ) )
499+ self . to_arrow_data_type_with_options ( ArrowSchemaOptions :: new ( ) )
460500 }
461501
462- pub fn to_arrow_data_type_with_options (
463- & self ,
464- timestamp_precision : TimestampPrecision ,
465- ) -> ArrowDataType {
502+ /// Convert this ORC data type to an Arrow data type with custom options.
503+ pub fn to_arrow_data_type_with_options ( & self , options : ArrowSchemaOptions ) -> ArrowDataType {
504+ let timestamp_precision = options. timestamp_precision ( ) ;
466505 let time_unit = match timestamp_precision {
467506 TimestampPrecision :: Microsecond => TimeUnit :: Microsecond ,
468507 TimestampPrecision :: Nanosecond => TimeUnit :: Nanosecond ,
@@ -494,24 +533,24 @@ impl DataType {
494533 . map ( |col| {
495534 let dt = col
496535 . data_type ( )
497- . to_arrow_data_type_with_options ( timestamp_precision ) ;
536+ . to_arrow_data_type_with_options ( options . clone ( ) ) ;
498537 Field :: new ( col. name ( ) , dt, true )
499538 } )
500539 . collect ( ) ;
501540 ArrowDataType :: Struct ( children)
502541 }
503542 DataType :: List { child, .. } => {
504- let child = child. to_arrow_data_type_with_options ( timestamp_precision ) ;
543+ let child = child. to_arrow_data_type_with_options ( options ) ;
505544 ArrowDataType :: new_list ( child, true )
506545 }
507546 DataType :: Map { key, value, .. } => {
508547 // TODO: this needs to be kept in sync with MapArrayDecoder
509548 // move to common location?
510549 // TODO: should it be "keys" and "values" (like arrow-rs)
511550 // or "key" and "value" like PyArrow and in Schema.fbs?
512- let key = key. to_arrow_data_type_with_options ( timestamp_precision ) ;
551+ let key = key. to_arrow_data_type_with_options ( options . clone ( ) ) ;
513552 let key = Field :: new ( "keys" , key, false ) ;
514- let value = value. to_arrow_data_type_with_options ( timestamp_precision ) ;
553+ let value = value. to_arrow_data_type_with_options ( options ) ;
515554 let value = Field :: new ( "values" , value, true ) ;
516555
517556 let dt = ArrowDataType :: Struct ( vec ! [ key, value] . into ( ) ) ;
@@ -527,7 +566,7 @@ impl DataType {
527566 // TODO: Support up to including 256
528567 // Need to do Union within Union
529568 let index = index as u8 as i8 ;
530- let arrow_dt = variant. to_arrow_data_type_with_options ( timestamp_precision ) ;
569+ let arrow_dt = variant. to_arrow_data_type_with_options ( options . clone ( ) ) ;
531570 // Name shouldn't matter here (only ORC struct types give names to subtypes anyway)
532571 // Using naming convention following PyArrow for easier comparison
533572 let field = Arc :: new ( Field :: new ( format ! ( "_union_{index}" ) , arrow_dt, true ) ) ;
0 commit comments