diff --git a/Cargo.toml b/Cargo.toml
index 7a09b50..3053ed7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ resolver = "2"
 [workspace.dependencies]
 arrow-array = "52"
 arrow-buffer = "52"
+arrow-cast = "52"
 arrow-schema = "52"
 
 [workspace.lints.clippy]
diff --git a/arrow-open-variant/Cargo.toml b/arrow-open-variant/Cargo.toml
index e71e4f7..a2e249e 100644
--- a/arrow-open-variant/Cargo.toml
+++ b/arrow-open-variant/Cargo.toml
@@ -13,6 +13,7 @@ rust-version = "1.70"
 [dependencies]
 arrow-array.workspace = true
 arrow-buffer.workspace = true
+arrow-cast.workspace = true
 arrow-schema.workspace = true
 open-variant = { path = "../open-variant" }
 
diff --git a/arrow-open-variant/src/array.rs b/arrow-open-variant/src/array.rs
new file mode 100644
index 0000000..f564615
--- /dev/null
+++ b/arrow-open-variant/src/array.rs
@@ -0,0 +1,87 @@
+use arrow_array::{cast::AsArray, types::Int8Type, Array, BinaryArray, Int8Array};
+use arrow_schema::ArrowError;
+use open_variant::{metadata::MetadataRef, values::VariantRef};
+
+use crate::variant_type;
+
+/// A borrowed view over a `StructArray` that represents a variant array.
+///
+/// Column 0 of the struct holds the dictionary-encoded metadata and column 1
+/// holds the binary variant values; see [`variant_type`] for the exact type.
+pub struct VariantArray<'a> {
+    /// One entry per dictionary value, in dictionary order. Null dictionary
+    /// values are kept as `None` so dictionary keys remain valid positions.
+    metadatas: Vec<Option<MetadataRef<'a>>>,
+    /// Indices into `metadatas` for each value.
+    metadata_indices: &'a Int8Array,
+    /// Array with the variant data
+    values: &'a BinaryArray,
+}
+
+impl<'a> VariantArray<'a> {
+    /// Builds a view that borrows the metadata and value columns of `array`.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ArrowError::InvalidArgumentError`] if the data type of `array`
+    /// is not equal to [`variant_type`].
+    pub fn try_new(array: &'a dyn Array) -> Result<VariantArray<'a>, ArrowError> {
+        // Validate it's the right type.
+        if array.data_type() != &variant_type() {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Expected a variant array, got {:?}",
+                array.data_type()
+            )));
+        }
+        let struct_array = array.as_struct();
+        let metadata_array = struct_array.column(0).as_dictionary::<Int8Type>();
+        let metadata_indices = metadata_array.keys();
+
+        // Keep a slot for every dictionary value, null or not: dropping nulls
+        // would shift later entries so keys would select the wrong metadata.
+        let metadatas = metadata_array
+            .values()
+            .as_binary::<i32>()
+            .iter()
+            .map(|v| v.map(MetadataRef::new))
+            .collect();
+
+        let values = struct_array.column(1).as_binary::<i32>();
+
+        Ok(Self {
+            metadatas,
+            metadata_indices,
+            values,
+        })
+    }
+
+    /// Returns the metadata for the row at `index`.
+    ///
+    /// Returns `None` if the row's dictionary key is null, negative, or past
+    /// the end of the dictionary, or if it selects a null dictionary value.
+    pub fn metadata(&self, index: usize) -> Option<&MetadataRef> {
+        if self.metadata_indices.is_null(index) {
+            return None;
+        }
+        let key = self.metadata_indices.value(index);
+        // `try_from` rejects a negative key instead of wrapping it like `as usize`.
+        let position = usize::try_from(key).ok()?;
+        self.metadatas.get(position)?.as_ref()
+    }
+
+    /// Returns the variant at row `index`, or `Ok(None)` if that row is null.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`ArrowError::ParseError`] if `VariantRef::try_new` rejects the bytes.
+    pub fn value(&self, index: usize) -> Result<Option<VariantRef>, ArrowError> {
+        if self.values.is_null(index) {
+            Ok(None)
+        } else {
+            let value = self.values.value(index);
+            Ok(Some(
+                VariantRef::try_new(value).map_err(ArrowError::ParseError)?,
+            ))
+        }
+    }
+}
diff --git a/arrow-open-variant/src/cast.rs b/arrow-open-variant/src/cast.rs
new file mode 100644
index 0000000..c3f922c
--- /dev/null
+++ b/arrow-open-variant/src/cast.rs
@@ -0,0 +1,88 @@
+//! Cast Arrow data types to Variant type.
+
+use std::sync::Arc;
+
+use arrow_array::{
+    builder::BinaryBuilder, cast::AsArray, Array, ArrayRef, BinaryArray, BooleanArray, StructArray,
+};
+use arrow_cast::cast::CastOptions;
+use arrow_schema::{ArrowError, DataType};
+use open_variant::{metadata::build_metadata, values::write::serialize_bool};
+
+use crate::{utils::make_repeated_dict_array, variant_fields};
+
+/// Casts an Arrow array to a variant array.
+///
+/// Only `DataType::Boolean` input is handled so far; `_options` is accepted
+/// but not consulted by the current implementation.
+///
+/// # Errors
+///
+/// Returns [`ArrowError::NotYetImplemented`] for every other input data type.
+pub fn cast_to_variant(array: &dyn Array, _options: &CastOptions) -> Result<ArrayRef, ArrowError> {
+    match array.data_type() {
+        DataType::Boolean => cast_to_variant_bool(array.as_boolean()),
+        _ => Err(ArrowError::NotYetImplemented(format!(
+            "Casting {:?} to Variant",
+            array.data_type()
+        ))),
+    }
+}
+
+/// Encodes each boolean as a one-byte variant value; null slots stay null.
+fn cast_to_variant_bool(array: &BooleanArray) -> Result<ArrayRef, ArrowError> {
+    let metadata = empty_metadata(array.len());
+
+    let mut values = BinaryBuilder::with_capacity(
+        array.len(),
+        array.len() - array.null_count(), // Each value is a single byte
+    );
+
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            values.append_null();
+        } else {
+            let value = array.value(i);
+            values.append_value([serialize_bool(value)]);
+        }
+    }
+
+    let values = values.finish();
+
+    let null_buffer = values.nulls().cloned();
+    Ok(Arc::new(StructArray::new(
+        variant_fields(),
+        vec![metadata, Arc::new(values) as ArrayRef],
+        null_buffer,
+    )) as ArrayRef)
+}
+
+/// Builds a `len`-row metadata column in which every row references the same
+/// metadata, built by `build_metadata` from an empty iterator.
+fn empty_metadata(len: usize) -> ArrayRef {
+    let metadata = build_metadata(std::iter::empty());
+    let metadata = BinaryArray::new_scalar(metadata);
+    make_repeated_dict_array(metadata, len)
+}
+
+#[cfg(test)]
+mod tests {
+    use arrow_array::BooleanArray;
+
+    use crate::array::VariantArray;
+
+    use super::*;
+
+    #[test]
+    fn test_bool_to_variant() {
+        let data = BooleanArray::from_iter(vec![Some(true), Some(false), None]);
+        let options = CastOptions::default();
+        let result = cast_to_variant(&data, &options).unwrap();
+        assert_eq!(result.len(), 3);
+
+        let variant = VariantArray::try_new(&result).unwrap();
+        assert!(variant.value(0).unwrap().unwrap().get_bool());
+        assert!(!variant.value(1).unwrap().unwrap().get_bool());
+        assert!(variant.value(2).unwrap().is_none());
+    }
+}
diff --git a/arrow-open-variant/src/json.rs b/arrow-open-variant/src/json.rs
index b0c124f..fc4901a 100644
--- a/arrow-open-variant/src/json.rs
+++ b/arrow-open-variant/src/json.rs
@@ -4,15 +4,16 @@ use std::borrow::Cow;
 use std::{collections::BTreeSet, sync::Arc};
 
 use arrow_array::builder::BinaryBuilder;
-use arrow_array::{
-    cast::AsArray, Array, ArrayRef, BinaryArray, DictionaryArray, Scalar, StructArray,
-};
+use arrow_array::{cast::AsArray, Array, ArrayRef, BinaryArray, StructArray};
 use arrow_buffer::NullBuffer;
-use arrow_schema::{ArrowError, DataType, Field};
+use arrow_schema::{ArrowError, DataType};
 use jiter::JsonValue;
 use open_variant::metadata::{build_metadata, MetadataRef};
 use open_variant::values::write::{self, ArrayBuilder, ObjectBuilder};
 
+use crate::utils::make_repeated_dict_array;
+use crate::variant_fields;
+
 /// Create a variant array from an array of JSON data.
 ///
 /// JSON data can be objects, arrays, strings, numbers, booleans, and nulls.
@@ -64,17 +65,9 @@ pub fn variant_from_json(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
     let data: BinaryArray =
         values_from_json(jsons_ref, array.null_count(), array.nulls(), &metadata_ref)?;
 
-    let fields = vec![
-        Field::new(
-            "metadata",
-            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Binary)),
-            false,
-        ),
-        Field::new("values", DataType::Binary, true),
-    ];
     let null_buffer = data.nulls().cloned();
     Ok(Arc::new(StructArray::new(
-        fields.into(),
+        variant_fields(),
         vec![metadata, Arc::new(data) as ArrayRef],
         null_buffer,
     )) as ArrayRef)
@@ -162,13 +155,6 @@ fn collect_all_keys<'a>(jsons: &[JsonValue<'a>]) -> Result
     Ok(seen)
 }
 
-fn make_repeated_dict_array(scalar: Scalar<BinaryArray>, length: usize) -> ArrayRef {
-    let dict_keys = std::iter::repeat(0_i8).take(length).collect::<Vec<_>>();
-    let metadata =
-        DictionaryArray::new(dict_keys.into(), Arc::new(scalar.into_inner()) as ArrayRef);
-    Arc::new(metadata)
-}
-
 fn values_from_json(
     jsons: &[jiter::JsonValue],
     null_count: usize,
@@ -250,6 +236,7 @@ mod tests {
     use arrow_array::{
         types::Int8Type, BinaryViewArray, Int8Array, LargeStringArray, StringArray, StringViewArray,
     };
+    use arrow_schema::Field;
     use open_variant::values::{BasicType, PrimitiveTypeId, VariantRef};
 
     use super::*;
diff --git a/arrow-open-variant/src/lib.rs b/arrow-open-variant/src/lib.rs
index c002109..9ccea91 100644
--- a/arrow-open-variant/src/lib.rs
+++ b/arrow-open-variant/src/lib.rs
@@ -1,2 +1,41 @@
+mod array;
+mod cast;
 #[cfg(feature = "json")]
 pub mod json;
+mod utils;
+
+pub use array::VariantArray;
+use arrow_schema::{DataType, Field, Fields};
+pub use cast::cast_to_variant;
+
+/// Name of the struct field that holds the variant metadata.
+pub const VARIANT_METADATA_FIELD: &str = "metadata";
+/// Name of the struct field that holds the variant values.
+pub const VARIANT_VALUES_FIELD: &str = "values";
+
+/// Returns the Arrow type of the metadata field: `Binary` dictionary-encoded with `Int8` keys.
+pub fn variant_metadata_type() -> DataType {
+    // TODO: can we be flexible about this type?
+    // TODO: should we use REE for this?
+    DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Binary))
+}
+
+/// Returns the Arrow type of the values field (`Binary`).
+pub fn variant_values_type() -> DataType {
+    // TODO: BinaryView?
+    DataType::Binary
+}
+
+/// Returns the two struct fields of a variant: non-nullable metadata, then nullable values.
+fn variant_fields() -> Fields {
+    vec![
+        Field::new(VARIANT_METADATA_FIELD, variant_metadata_type(), false),
+        Field::new(VARIANT_VALUES_FIELD, variant_values_type(), true),
+    ]
+    .into()
+}
+
+/// Returns the Arrow struct type used to represent variant arrays.
+pub fn variant_type() -> DataType {
+    DataType::Struct(variant_fields())
+}
diff --git a/arrow-open-variant/src/utils.rs b/arrow-open-variant/src/utils.rs
new file mode 100644
index 0000000..ae70ccb
--- /dev/null
+++ b/arrow-open-variant/src/utils.rs
@@ -0,0 +1,12 @@
+use std::sync::Arc;
+
+use arrow_array::{ArrayRef, BinaryArray, DictionaryArray, Scalar};
+
+/// Builds a dictionary array of `length` rows whose keys are all `0`, so every
+/// row references the first dictionary value taken from `scalar`.
+pub fn make_repeated_dict_array(scalar: Scalar<BinaryArray>, length: usize) -> ArrayRef {
+    let dict_keys = vec![0_i8; length];
+    let metadata =
+        DictionaryArray::new(dict_keys.into(), Arc::new(scalar.into_inner()) as ArrayRef);
+    Arc::new(metadata)
+}
diff --git a/open-variant/src/values/write.rs b/open-variant/src/values/write.rs
index 0066b79..5dc1550 100644
--- a/open-variant/src/values/write.rs
+++ b/open-variant/src/values/write.rs
@@ -11,11 +11,24 @@ fn primitive_header(primitive_type_id: PrimitiveTypeId) -> u8 {
     basic_type | (primitive_type_id as u8) << 2
 }
 
+/// Returns the primitive header byte for a variant null.
+pub fn serialize_null() -> u8 {
+    primitive_header(PrimitiveTypeId::Null)
+}
+
 pub fn write_null(buffer: &mut Vec<u8>) {
     let header = primitive_header(PrimitiveTypeId::Null);
     buffer.push(header);
 }
 
+/// Returns the primitive header byte that encodes `value` as a variant boolean.
+pub fn serialize_bool(value: bool) -> u8 {
+    match value {
+        true => primitive_header(PrimitiveTypeId::BoolTrue),
+        false => primitive_header(PrimitiveTypeId::BoolFalse),
+    }
+}
+
 pub fn write_bool(buffer: &mut Vec<u8>, value: bool) {
     // Booleans are just headers
     let header = match value {