Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: cast boolean to variant #10

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ resolver = "2"
[workspace.dependencies]
arrow-array = "52"
arrow-buffer = "52"
arrow-cast = "52"
arrow-schema = "52"

[workspace.lints.clippy]
Expand Down
1 change: 1 addition & 0 deletions arrow-open-variant/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ rust-version = "1.70"
[dependencies]
arrow-array.workspace = true
arrow-buffer.workspace = true
arrow-cast.workspace = true
arrow-schema.workspace = true
open-variant = { path = "../open-variant" }

Expand Down
65 changes: 65 additions & 0 deletions arrow-open-variant/src/array.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
use arrow_array::{cast::AsArray, types::Int8Type, Array, BinaryArray, Int8Array};
use arrow_schema::ArrowError;
use open_variant::{metadata::MetadataRef, values::VariantRef};

use crate::variant_type;

/// A wrapper around a `StructArray` that represents a variant array.
pub struct VariantArray<'a> {
    /// One slot per entry of the metadata dictionary, in dictionary order.
    ///
    /// A null dictionary entry is kept as `None` (rather than dropped) so that
    /// the dictionary keys remain valid positions into this vector.
    metadatas: Vec<Option<MetadataRef<'a>>>,
    /// Indices into `metadatas` for each value.
    metadata_indices: &'a Int8Array,
    /// Array with the variant data
    values: &'a BinaryArray,
}

impl<'a> VariantArray<'a> {
    /// Wraps `array` as a variant array.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::InvalidArgumentError`] when the data type of
    /// `array` is not exactly [`variant_type`].
    pub fn try_new(array: &'a dyn Array) -> Result<VariantArray<'a>, ArrowError> {
        // Validate it's the right type.
        if array.data_type() != &variant_type() {
            return Err(ArrowError::InvalidArgumentError(format!(
                "Expected a variant array, got {:?}",
                array.data_type()
            )));
        }
        let struct_array = array.as_struct();
        let metadata_array = struct_array.column(0).as_dictionary::<Int8Type>();
        let metadata_indices = metadata_array.keys();

        // Keep the dictionary positions intact: a key `k` must resolve to the
        // k-th dictionary entry, so null entries become `None` placeholders
        // instead of being filtered out (which would shift later entries).
        let metadatas = metadata_array
            .values()
            .as_binary::<i32>()
            .iter()
            .map(|entry| entry.map(MetadataRef::new))
            .collect();

        let values = struct_array.column(1).as_binary::<i32>();

        Ok(Self {
            metadatas,
            metadata_indices,
            values,
        })
    }

    /// Returns the metadata for the row at `index`.
    ///
    /// Yields `None` when the dictionary key for that row is null, when the
    /// key is negative or past the end of the dictionary, or when the
    /// dictionary entry it points at is itself null.
    pub fn metadata(&self, index: usize) -> Option<&MetadataRef> {
        if self.metadata_indices.is_null(index) {
            return None;
        }
        let key = self.metadata_indices.value(index);
        // A checked conversion rejects negative keys explicitly instead of
        // relying on an `as` cast wrapping them to an out-of-range position.
        let slot = usize::try_from(key).ok()?;
        self.metadatas.get(slot)?.as_ref()
    }

    /// Returns the variant value stored in the row at `index`.
    ///
    /// Yields `Ok(None)` when the values column is null at `index`.
    ///
    /// # Errors
    ///
    /// Returns [`ArrowError::ParseError`] when `VariantRef::try_new` rejects
    /// the stored bytes.
    pub fn value(&self, index: usize) -> Result<Option<VariantRef>, ArrowError> {
        if self.values.is_null(index) {
            return Ok(None);
        }
        let bytes = self.values.value(index);
        VariantRef::try_new(bytes)
            .map(Some)
            .map_err(ArrowError::ParseError)
    }
}
77 changes: 77 additions & 0 deletions arrow-open-variant/src/cast.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
//! Cast Arrow data types to Variant type.

use std::sync::Arc;

use arrow_array::{
builder::BinaryBuilder, cast::AsArray, Array, ArrayRef, BinaryArray, BooleanArray, StructArray,
};
use arrow_cast::cast::CastOptions;
use arrow_schema::{ArrowError, DataType};
use open_variant::{metadata::build_metadata, values::write::serialize_bool};

use crate::{utils::make_repeated_dict_array, variant_fields};

/// Casts an Arrow array to a variant array.
///
/// Only [`DataType::Boolean`] input is handled at the moment; `_options` is
/// accepted but not consulted.
///
/// # Errors
///
/// Returns [`ArrowError::NotYetImplemented`] for every other input data type.
pub fn cast_to_variant(array: &dyn Array, _options: &CastOptions) -> Result<ArrayRef, ArrowError> {
    match array.data_type() {
        DataType::Boolean => cast_to_variant_bool(array.as_boolean()),
        unsupported => {
            let message = format!("Casting {:?} to Variant", unsupported);
            Err(ArrowError::NotYetImplemented(message))
        }
    }
}

/// Encodes a boolean array as a variant struct array.
///
/// Each non-null boolean becomes a one-byte binary value produced by
/// `serialize_bool`; null inputs become null values. The struct's null buffer
/// is taken from the encoded values column.
fn cast_to_variant_bool(array: &BooleanArray) -> Result<ArrayRef, ArrowError> {
    let row_count = array.len();
    // Each value is a single byte, so the data capacity is the non-null count.
    let data_bytes = row_count - array.null_count();
    let mut builder = BinaryBuilder::with_capacity(row_count, data_bytes);

    for maybe_flag in array.iter() {
        match maybe_flag {
            Some(flag) => builder.append_value([serialize_bool(flag)]),
            None => builder.append_null(),
        }
    }

    let encoded = builder.finish();
    // Capture the validity before `encoded` is moved into the column list.
    let validity = encoded.nulls().cloned();
    let columns = vec![empty_metadata(row_count), Arc::new(encoded) as ArrayRef];

    Ok(Arc::new(StructArray::new(variant_fields(), columns, validity)) as ArrayRef)
}

/// Builds a metadata column of `len` rows that all share one metadata blob,
/// the one `build_metadata` produces for an empty iterator.
fn empty_metadata(len: usize) -> ArrayRef {
    let scalar = BinaryArray::new_scalar(build_metadata(std::iter::empty()));
    make_repeated_dict_array(scalar, len)
}

#[cfg(test)]
mod tests {
    use arrow_array::BooleanArray;

    use crate::array::VariantArray;

    use super::*;

    /// Casting `[true, false, null]` yields a three-row variant array whose
    /// rows read back as `true`, `false`, and an absent value.
    #[test]
    fn test_bool_to_variant() {
        let input = BooleanArray::from_iter(vec![Some(true), Some(false), None]);
        let casted = cast_to_variant(&input, &CastOptions::default()).unwrap();
        assert_eq!(casted.len(), 3);

        let variants = VariantArray::try_new(&casted).unwrap();

        let first = variants.value(0).unwrap().unwrap();
        assert!(first.get_bool());

        let second = variants.value(1).unwrap().unwrap();
        assert!(!second.get_bool());

        let third = variants.value(2).unwrap();
        assert!(third.is_none());
    }
}
27 changes: 7 additions & 20 deletions arrow-open-variant/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@ use std::borrow::Cow;
use std::{collections::BTreeSet, sync::Arc};

use arrow_array::builder::BinaryBuilder;
use arrow_array::{
cast::AsArray, Array, ArrayRef, BinaryArray, DictionaryArray, Scalar, StructArray,
};
use arrow_array::{cast::AsArray, Array, ArrayRef, BinaryArray, StructArray};
use arrow_buffer::NullBuffer;
use arrow_schema::{ArrowError, DataType, Field};
use arrow_schema::{ArrowError, DataType};
use jiter::JsonValue;
use open_variant::metadata::{build_metadata, MetadataRef};
use open_variant::values::write::{self, ArrayBuilder, ObjectBuilder};

use crate::utils::make_repeated_dict_array;
use crate::variant_fields;

/// Create a variant array from an array of JSON data.
///
/// JSON data can be objects, arrays, strings, numbers, booleans, and nulls.
Expand Down Expand Up @@ -64,17 +65,9 @@ pub fn variant_from_json(array: &dyn Array) -> Result<ArrayRef, ArrowError> {

let data: BinaryArray =
values_from_json(jsons_ref, array.null_count(), array.nulls(), &metadata_ref)?;
let fields = vec![
Field::new(
"metadata",
DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Binary)),
false,
),
Field::new("values", DataType::Binary, true),
];
let null_buffer = data.nulls().cloned();
Ok(Arc::new(StructArray::new(
fields.into(),
variant_fields(),
vec![metadata, Arc::new(data) as ArrayRef],
null_buffer,
)) as ArrayRef)
Expand Down Expand Up @@ -162,13 +155,6 @@ fn collect_all_keys<'a>(jsons: &[JsonValue<'a>]) -> Result<BTreeSet<Cow<'a, str>
Ok(seen)
}

/// Builds a dictionary array of `length` rows whose keys are all `0`, so every
/// row refers to the single value held by `scalar`.
fn make_repeated_dict_array(scalar: Scalar<BinaryArray>, length: usize) -> ArrayRef {
    let keys = vec![0_i8; length];
    let dictionary_values = Arc::new(scalar.into_inner()) as ArrayRef;
    Arc::new(DictionaryArray::new(keys.into(), dictionary_values))
}

fn values_from_json(
jsons: &[jiter::JsonValue],
null_count: usize,
Expand Down Expand Up @@ -250,6 +236,7 @@ mod tests {
use arrow_array::{
types::Int8Type, BinaryViewArray, Int8Array, LargeStringArray, StringArray, StringViewArray,
};
use arrow_schema::Field;
use open_variant::values::{BasicType, PrimitiveTypeId, VariantRef};

use super::*;
Expand Down
33 changes: 33 additions & 0 deletions arrow-open-variant/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,35 @@
mod array;
mod cast;
#[cfg(feature = "json")]
pub mod json;
mod utils;

pub use array::VariantArray;
use arrow_schema::{DataType, Field, Fields};
pub use cast::cast_to_variant;

pub const VARIANT_METADATA_FIELD: &str = "metadata";
pub const VARIANT_VALUES_FIELD: &str = "values";

/// Returns the Arrow data type of the variant metadata column: a dictionary
/// with `Int8` keys and `Binary` values.
pub fn variant_metadata_type() -> DataType {
    // TODO: can we be flexible about this type?
    // TODO: should we use REE for this?
    let key_type = Box::new(DataType::Int8);
    let value_type = Box::new(DataType::Binary);
    DataType::Dictionary(key_type, value_type)
}

/// Returns the Arrow data type of the variant values column
/// (currently `DataType::Binary`).
pub fn variant_values_type() -> DataType {
// TODO: BinaryView?
DataType::Binary
}

fn variant_fields() -> Fields {
vec![
Field::new(VARIANT_METADATA_FIELD, variant_metadata_type(), false),
Field::new(VARIANT_VALUES_FIELD, variant_values_type(), true),
]
.into()
}

/// Returns the Arrow data type of a variant array: a struct made of the
/// fields returned by `variant_fields`.
pub fn variant_type() -> DataType {
DataType::Struct(variant_fields())
}
10 changes: 10 additions & 0 deletions arrow-open-variant/src/utils.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
use std::sync::Arc;

use arrow_array::{ArrayRef, BinaryArray, DictionaryArray, Scalar};

/// Builds a dictionary array of `length` rows whose keys are all `0`, so every
/// row refers to the single value held by `scalar`.
pub fn make_repeated_dict_array(scalar: Scalar<BinaryArray>, length: usize) -> ArrayRef {
    let keys = vec![0_i8; length];
    let dictionary_values = Arc::new(scalar.into_inner()) as ArrayRef;
    Arc::new(DictionaryArray::new(keys.into(), dictionary_values))
}
11 changes: 11 additions & 0 deletions open-variant/src/values/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,22 @@ fn primitive_header(primitive_type_id: PrimitiveTypeId) -> u8 {
basic_type | (primitive_type_id as u8) << 2
}

/// Returns the header byte that `primitive_header` produces for
/// `PrimitiveTypeId::Null`.
pub fn serialize_null() -> u8 {
primitive_header(PrimitiveTypeId::Null)
}

/// Appends the header byte for `PrimitiveTypeId::Null` to `buffer`.
pub fn write_null(buffer: &mut Vec<u8>) {
    buffer.push(primitive_header(PrimitiveTypeId::Null));
}

/// Returns the header byte for a boolean: the `BoolTrue` primitive header when
/// `value` is `true`, the `BoolFalse` primitive header otherwise.
pub fn serialize_bool(value: bool) -> u8 {
    let type_id = if value {
        PrimitiveTypeId::BoolTrue
    } else {
        PrimitiveTypeId::BoolFalse
    };
    primitive_header(type_id)
}

pub fn write_bool(buffer: &mut Vec<u8>, value: bool) {
// Booleans are just headers
let header = match value {
Expand Down
Loading