Skip to content

Commit 8dcdf74

Browse files
committed
builder
1 parent 867d5b8 commit 8dcdf74

3 files changed

Lines changed: 58 additions & 20 deletions

File tree

src/arrow_reader.rs

Lines changed: 4 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ use crate::projection::ProjectionMask;
2929
use crate::reader::metadata::{read_metadata, FileMetadata};
3030
use crate::reader::ChunkReader;
3131
use crate::row_selection::RowSelection;
32-
use crate::schema::{RootDataType, TimestampPrecision};
32+
use crate::schema::{ArrowSchemaOptions, RootDataType, TimestampPrecision};
3333
use crate::stripe::{Stripe, StripeMetadata};
3434

3535
const DEFAULT_BATCH_SIZE: usize = 8192;
@@ -149,10 +149,9 @@ impl<R> ArrowReaderBuilder<R> {
149149
.map(|(key, value)| (key.clone(), String::from_utf8_lossy(value).to_string()))
150150
.collect::<HashMap<_, _>>();
151151
self.schema_ref.clone().unwrap_or_else(|| {
152-
Arc::new(
153-
projected_data_type
154-
.create_arrow_schema_with_options(&metadata, self.timestamp_precision),
155-
)
152+
let options =
153+
ArrowSchemaOptions::new().with_timestamp_precision(self.timestamp_precision);
154+
Arc::new(projected_data_type.create_arrow_schema_with_options(&metadata, options))
156155
})
157156
}
158157
}

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -72,4 +72,4 @@ pub use arrow_writer::{ArrowWriter, ArrowWriterBuilder};
7272
#[cfg(feature = "async")]
7373
pub use async_arrow_reader::ArrowStreamReader;
7474
pub use row_selection::{RowSelection, RowSelector};
75-
pub use schema::TimestampPrecision;
75+
pub use schema::{ArrowSchemaOptions, TimestampPrecision};

src/schema.rs

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,45 @@ pub enum TimestampPrecision {
3737
Nanosecond,
3838
}
3939

40+
/// Builder-style options for Arrow schema conversion; the only setting it currently carries is the timestamp precision.
41+
#[derive(Debug, Clone)]
42+
pub struct ArrowSchemaOptions {
43+
timestamp_precision: TimestampPrecision,
44+
}
45+
46+
impl Default for ArrowSchemaOptions {
47+
fn default() -> Self {
48+
Self::new()
49+
}
50+
}
51+
52+
impl ArrowSchemaOptions {
53+
/// Create a new options builder with default values.
54+
/// - Timestamp precision is [`TimestampPrecision::Nanosecond`]
55+
pub fn new() -> Self {
56+
Self {
57+
timestamp_precision: TimestampPrecision::default(),
58+
}
59+
}
60+
61+
/// Set the timestamp precision used when converting ORC timestamps to Arrow, consuming and returning the builder so calls can be chained.
62+
///
63+
/// ORC timestamps have nanosecond precision, but you may want to convert
64+
/// them to microseconds for compatibility with systems that don't support
65+
/// nanosecond precision.
66+
///
67+
/// Default: [`TimestampPrecision::Nanosecond`]
68+
pub fn with_timestamp_precision(mut self, precision: TimestampPrecision) -> Self {
69+
self.timestamp_precision = precision;
70+
self
71+
}
72+
73+
/// Returns the configured timestamp precision. Private: read by the data type conversion code in this file.
74+
fn timestamp_precision(&self) -> TimestampPrecision {
75+
self.timestamp_precision
76+
}
77+
}
78+
4079
/// Represents the root data type of the ORC file. Contains multiple named child types
4180
/// which map to the columns available. Allows projecting only specific columns from
4281
/// the base schema.
@@ -73,22 +112,22 @@ impl RootDataType {
73112

74113
/// Convert into an Arrow schema.
75114
pub fn create_arrow_schema(&self, user_metadata: &HashMap<String, String>) -> Schema {
76-
self.create_arrow_schema_with_options(user_metadata, TimestampPrecision::default())
115+
self.create_arrow_schema_with_options(user_metadata, ArrowSchemaOptions::new())
77116
}
78117

79-
/// Convert into an Arrow schema with specified timestamp precision.
118+
/// Convert into an Arrow schema with custom options.
80119
pub fn create_arrow_schema_with_options(
81120
&self,
82121
user_metadata: &HashMap<String, String>,
83-
timestamp_precision: TimestampPrecision,
122+
options: ArrowSchemaOptions,
84123
) -> Schema {
85124
let fields = self
86125
.children
87126
.iter()
88127
.map(|col| {
89128
let dt = col
90129
.data_type()
91-
.to_arrow_data_type_with_options(timestamp_precision);
130+
.to_arrow_data_type_with_options(options.clone());
92131
Field::new(col.name(), dt, true)
93132
})
94133
.collect::<Vec<_>>();
@@ -455,14 +494,14 @@ impl DataType {
455494
Ok(dt)
456495
}
457496

497+
/// Convert this ORC data type to an Arrow data type with default options.
458498
pub fn to_arrow_data_type(&self) -> ArrowDataType {
459-
self.to_arrow_data_type_with_options(TimestampPrecision::default())
499+
self.to_arrow_data_type_with_options(ArrowSchemaOptions::new())
460500
}
461501

462-
pub fn to_arrow_data_type_with_options(
463-
&self,
464-
timestamp_precision: TimestampPrecision,
465-
) -> ArrowDataType {
502+
/// Convert this ORC data type to an Arrow data type with custom options.
503+
pub fn to_arrow_data_type_with_options(&self, options: ArrowSchemaOptions) -> ArrowDataType {
504+
let timestamp_precision = options.timestamp_precision();
466505
let time_unit = match timestamp_precision {
467506
TimestampPrecision::Microsecond => TimeUnit::Microsecond,
468507
TimestampPrecision::Nanosecond => TimeUnit::Nanosecond,
@@ -494,24 +533,24 @@ impl DataType {
494533
.map(|col| {
495534
let dt = col
496535
.data_type()
497-
.to_arrow_data_type_with_options(timestamp_precision);
536+
.to_arrow_data_type_with_options(options.clone());
498537
Field::new(col.name(), dt, true)
499538
})
500539
.collect();
501540
ArrowDataType::Struct(children)
502541
}
503542
DataType::List { child, .. } => {
504-
let child = child.to_arrow_data_type_with_options(timestamp_precision);
543+
let child = child.to_arrow_data_type_with_options(options);
505544
ArrowDataType::new_list(child, true)
506545
}
507546
DataType::Map { key, value, .. } => {
508547
// TODO: this needs to be kept in sync with MapArrayDecoder
509548
// move to common location?
510549
// TODO: should it be "keys" and "values" (like arrow-rs)
511550
// or "key" and "value" like PyArrow and in Schema.fbs?
512-
let key = key.to_arrow_data_type_with_options(timestamp_precision);
551+
let key = key.to_arrow_data_type_with_options(options.clone());
513552
let key = Field::new("keys", key, false);
514-
let value = value.to_arrow_data_type_with_options(timestamp_precision);
553+
let value = value.to_arrow_data_type_with_options(options);
515554
let value = Field::new("values", value, true);
516555

517556
let dt = ArrowDataType::Struct(vec![key, value].into());
@@ -527,7 +566,7 @@ impl DataType {
527566
// TODO: Support up to including 256
528567
// Need to do Union within Union
529568
let index = index as u8 as i8;
530-
let arrow_dt = variant.to_arrow_data_type_with_options(timestamp_precision);
569+
let arrow_dt = variant.to_arrow_data_type_with_options(options.clone());
531570
// Name shouldn't matter here (only ORC struct types give names to subtypes anyway)
532571
// Using naming convention following PyArrow for easier comparison
533572
let field = Arc::new(Field::new(format!("_union_{index}"), arrow_dt, true));

0 commit comments

Comments
 (0)