From e6c07fc05f793b7d972b0d67c56e1cb8782164b7 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sat, 25 Apr 2026 08:56:21 -0400 Subject: [PATCH 1/3] . --- datafusion/functions/src/string/common.rs | 210 +++++++++++++++++----- datafusion/functions/src/string/lower.rs | 140 +++++++++++++++ datafusion/functions/src/strings.rs | 25 ++- 3 files changed, 314 insertions(+), 61 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 70699c3653c4b..6a673d3a22c0d 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -19,7 +19,10 @@ use std::sync::Arc; -use crate::strings::{GenericStringArrayBuilder, StringViewArrayBuilder, append_view}; +use crate::strings::{ + GenericStringArrayBuilder, STRING_VIEW_INIT_BLOCK_SIZE, STRING_VIEW_MAX_BLOCK_SIZE, + StringViewArrayBuilder, append_view, +}; use arrow::array::{ Array, ArrayRef, GenericStringArray, NullBufferBuilder, OffsetSizeTrait, StringViewArray, new_null_array, @@ -323,32 +326,42 @@ fn string_trim(args: &[ArrayRef]) -> Result Result { - case_conversion(args, |string| string.to_lowercase(), name) + case_conversion(args, true, name) } pub(crate) fn to_upper(args: &[ColumnarValue], name: &str) -> Result { - case_conversion(args, |string| string.to_uppercase(), name) + case_conversion(args, false, name) +} + +#[inline] +fn unicode_case(s: &str, lower: bool) -> String { + if lower { + s.to_lowercase() + } else { + s.to_uppercase() + } } -fn case_conversion<'a, F>( - args: &'a [ColumnarValue], - op: F, +fn case_conversion( + args: &[ColumnarValue], + lower: bool, name: &str, -) -> Result -where - F: Fn(&'a str) -> String, -{ +) -> Result { match &args[0] { ColumnarValue::Array(array) => match array.data_type() { - DataType::Utf8 => Ok(ColumnarValue::Array(case_conversion_array::( - array, op, + DataType::Utf8 => Ok(ColumnarValue::Array(case_conversion_array::( + array, lower, )?)), - DataType::LargeUtf8 => Ok(ColumnarValue::Array(case_conversion_array::< - i64, - _, - >(array, op)?)), + DataType::LargeUtf8 => Ok(ColumnarValue::Array( + case_conversion_array::(array, lower)?, + )), DataType::Utf8View => { let string_array = as_string_view_array(array)?; + if string_array.is_ascii() { + return Ok(ColumnarValue::Array(Arc::new( + case_conversion_utf8view_ascii(string_array, lower), + ))); + } let item_len = string_array.len(); // Null-preserving: reuse the input null buffer as the output null buffer. let nulls = string_array.nulls().cloned(); @@ -361,14 +374,14 @@ where } else { // SAFETY: `n.is_null(i)` was false in the branch above. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&op(s)); + builder.append_value(&unicode_case(s, lower)); } } } else { for i in 0..item_len { // SAFETY: no null buffer means every index is valid. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&op(s)); + builder.append_value(&unicode_case(s, lower)); } } @@ -378,15 +391,15 @@ where }, ColumnarValue::Scalar(scalar) => match scalar { ScalarValue::Utf8(a) => { - let result = a.as_ref().map(|x| op(x)); + let result = a.as_ref().map(|x| unicode_case(x, lower)); Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result))) } ScalarValue::LargeUtf8(a) => { - let result = a.as_ref().map(|x| op(x)); + let result = a.as_ref().map(|x| unicode_case(x, lower)); Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result))) } ScalarValue::Utf8View(a) => { - let result = a.as_ref().map(|x| op(x)); + let result = a.as_ref().map(|x| unicode_case(x, lower)); Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(result))) } other => exec_err!("Unsupported data type {other:?} for function {name}"), @@ -394,16 +407,15 @@ where } } -fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) -> Result -where - O: OffsetSizeTrait, - F: Fn(&'a str) -> String, -{ +fn case_conversion_array( + array: &ArrayRef, + lower: bool, +) -> Result { const PRE_ALLOC_BYTES: usize = 8; let string_array = as_generic_string_array::(array)?; if string_array.is_ascii() { - return case_conversion_ascii_array::(string_array, op); + return case_conversion_ascii_array::(string_array, lower); } // Values contain non-ASCII. @@ -423,43 +435,147 @@ where } else { // SAFETY: `n.is_null(i)` was false in the branch above. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&op(s)); + builder.append_value(&unicode_case(s, lower)); } } } else { for i in 0..item_len { // SAFETY: no null buffer means every index is valid. let s = unsafe { string_array.value_unchecked(i) }; - builder.append_value(&op(s)); + builder.append_value(&unicode_case(s, lower)); } } Ok(Arc::new(builder.finish(nulls)?)) } +/// Fast path for case conversion on an all-ASCII `StringViewArray`. +fn case_conversion_utf8view_ascii( + array: &StringViewArray, + lower: bool, +) -> StringViewArray { + // Specialize per conversion so the byte call inlines in the hot loops below. + if lower { + case_conversion_utf8view_ascii_inner(array, u8::to_ascii_lowercase) + } else { + case_conversion_utf8view_ascii_inner(array, u8::to_ascii_uppercase) + } +} + +/// Walks the views once: inline rows (length ≤ 12) convert their inline bytes +/// in place; long rows copy their referenced bytes into a single packed output +/// buffer while converting, then rewrite the view (`buffer_index = 0`, new +/// offset, new 4-byte prefix) to point at it. +fn case_conversion_utf8view_ascii_inner u8>( + array: &StringViewArray, + convert: F, +) -> StringViewArray { + let item_len = array.len(); + let views = array.views(); + let data_buffers = array.data_buffers(); + let nulls = array.nulls(); + + let mut new_views: Vec = Vec::with_capacity(item_len); + let mut in_progress: Vec = Vec::new(); + let mut completed: Vec = Vec::new(); + let mut block_size: u32 = STRING_VIEW_INIT_BLOCK_SIZE; + + for i in 0..item_len { + if nulls.is_some_and(|n| n.is_null(i)) { + new_views.push(0); + continue; + } + let view = views[i]; + let len = view as u32 as usize; + if len == 0 { + new_views.push(0); + continue; + } + let mut bytes = view.to_le_bytes(); + if len <= 12 { + // Inline row: convert the inline data bytes; layout unchanged. + for b in &mut bytes[4..4 + len] { + *b = convert(b); + } + new_views.push(u128::from_le_bytes(bytes)); + } else { + // Make sure the current data block has room for this value; + // otherwise flush and start a new, larger block. + let required_cap = in_progress.len() + len; + if in_progress.capacity() < required_cap { + if !in_progress.is_empty() { + completed.push(Buffer::from_vec(std::mem::take(&mut in_progress))); + } + if block_size < STRING_VIEW_MAX_BLOCK_SIZE { + block_size = block_size.saturating_mul(2); + } + let to_reserve = len.max(block_size as usize); + in_progress.reserve(to_reserve); + } + + let buffer_index: u32 = i32::try_from(completed.len()) + .expect("buffer count exceeds i32::MAX") + as u32; + let new_offset: u32 = + i32::try_from(in_progress.len()).expect("offset exceeds i32::MAX") as u32; + + let src_buffer_index = + u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize; + let src_offset = + u32::from_le_bytes(bytes[12..16].try_into().unwrap()) as usize; + let src = + &data_buffers[src_buffer_index].as_slice()[src_offset..src_offset + len]; + + let prefix_start = in_progress.len(); + in_progress.extend(src.iter().map(&convert)); + + // Prefix is the first 4 bytes of the converted data we just wrote. + let prefix: [u8; 4] = in_progress[prefix_start..prefix_start + 4] + .try_into() + .unwrap(); + bytes[4..8].copy_from_slice(&prefix); + bytes[8..12].copy_from_slice(&buffer_index.to_le_bytes()); + bytes[12..16].copy_from_slice(&new_offset.to_le_bytes()); + new_views.push(u128::from_le_bytes(bytes)); + } + } + + if !in_progress.is_empty() { + completed.push(Buffer::from_vec(in_progress)); + } + + // SAFETY: each long view's buffer_index addresses a buffer we wrote, and + // its offset addresses bytes within that buffer; prefixes were copied from + // those same bytes; inline views were rewritten from valid inline bytes; + // null/empty rows are zero views with no buffer reference; row count is + // unchanged. + unsafe { + StringViewArray::new_unchecked( + ScalarBuffer::from(new_views), + completed, + array.nulls().cloned(), + ) + } +} + /// Fast path for case conversion on an all-ASCII string array. ASCII case /// conversion is byte-length-preserving, so we can convert the entire addressed -/// range in one call and reuse the offsets and nulls buffers — rebasing the -/// offsets when the input is a sliced array. -fn case_conversion_ascii_array<'a, O, F>( - string_array: &'a GenericStringArray, - op: F, -) -> Result -where - O: OffsetSizeTrait, - F: Fn(&'a str) -> String, -{ +/// byte range in one pass over the value buffer and reuse the offsets and nulls +/// buffers — rebasing the offsets when the input is a sliced array. +fn case_conversion_ascii_array( + string_array: &GenericStringArray, + lower: bool, +) -> Result { let value_offsets = string_array.value_offsets(); let start = value_offsets.first().unwrap().as_usize(); let end = value_offsets.last().unwrap().as_usize(); let relevant = &string_array.value_data()[start..end]; - // SAFETY: `relevant` is a subslice of the string array's value buffer, - // which is valid UTF-8. - let str_values = unsafe { std::str::from_utf8_unchecked(relevant) }; - - let converted_values = op(str_values); - debug_assert_eq!(converted_values.len(), str_values.len()); - let values = Buffer::from_vec(converted_values.into_bytes()); + let converted: Vec = if lower { + relevant.iter().map(u8::to_ascii_lowercase).collect() + } else { + relevant.iter().map(u8::to_ascii_uppercase).collect() + }; + let values = Buffer::from_vec(converted); // Shift offsets from `start`-based to 0-based so they index into `values`. let offsets = if start == 0 { @@ -468,7 +584,7 @@ where let s = O::usize_as(start); let rebased: Vec = value_offsets.iter().map(|&o| o - s).collect(); // SAFETY: subtracting a constant from monotonic offsets preserves - // monotonicity, and `start` is the minimum offset so no underflow. + // monotonicity, and `start` is the minimum offset, so no underflow. unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(rebased)) } }; diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs index 0b0476d0ec712..57cbe1d8779f0 100644 --- a/datafusion/functions/src/string/lower.rs +++ b/datafusion/functions/src/string/lower.rs @@ -210,6 +210,146 @@ mod tests { to_lower(input, expected) } + #[test] + fn lower_ascii_utf8view() -> Result<()> { + // Mix of inlined (≤12 bytes) and referenced (>12 bytes) strings, plus + // a null and an empty, to exercise the all-ASCII Utf8View fast path. + let input = Arc::new(StringViewArray::from(vec![ + Some("ARROW"), // inlined short + None, + Some("HELLO WORLD 123"), // referenced (15 bytes) + Some(""), + Some("0123456789"), // inlined, no case change + Some("DATAFUSION IS COOL"), // referenced + ])) as ArrayRef; + + let expected = Arc::new(StringViewArray::from(vec![ + Some("arrow"), + None, + Some("hello world 123"), + Some(""), + Some("0123456789"), + Some("datafusion is cool"), + ])) as ArrayRef; + + to_lower(input, expected) + } + + #[test] + fn lower_sliced_ascii_utf8view() -> Result<()> { + // Slice of a parent that contains a non-ASCII string outside the + // slice. The slice is all-ASCII, so the fast path must run and produce + // correct output while the parent's unaddressed non-ASCII bytes are + // irrelevant to the result. + let parent = Arc::new(StringViewArray::from(vec![ + Some("农历新年LONG ENOUGH FOR BUFFER"), + Some("HELLO WORLD 123"), + Some("DATAFUSION ROCKS!"), + Some("ZZZZZZZZZZZZZZZZ"), + ])) as ArrayRef; + let sliced = parent.slice(1, 2); + let result = invoke_lower(sliced)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected = StringViewArray::from(vec![ + Some("hello world 123"), + Some("datafusion rocks!"), + ]); + assert_eq!(result_sv, &expected); + // The slice's two long views address 15 + 17 = 32 bytes; the ASCII + // fast path must produce a single packed buffer of exactly that + // size, not one scaled to the parent's data buffer. + assert_eq!(result_sv.data_buffers().len(), 1); + assert_eq!(result_sv.data_buffers()[0].len(), 32); + Ok(()) + } + + #[test] + fn lower_utf8view_inline_only_no_buffers() -> Result<()> { + // An array whose values are all ≤ 12 bytes is fully inline; the ASCII + // fast path should produce no data buffers at all. + let input = Arc::new(StringViewArray::from(vec![ + Some("HELLO"), + None, + Some(""), + Some("0123456789ab"), // 12 bytes — inline boundary + ])) as ArrayRef; + let result = invoke_lower(input)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected = StringViewArray::from(vec![ + Some("hello"), + None, + Some(""), + Some("0123456789ab"), + ]); + assert_eq!(result_sv, &expected); + assert_eq!( + result_sv.data_buffers().len(), + 0, + "inline-only Utf8View should produce no data buffers" + ); + Ok(()) + } + + #[test] + fn lower_utf8view_long_packs_tight() -> Result<()> { + // Mix of long and inline values; the long values should be packed into + // a single tight output buffer whose size is exactly the sum of their + // lengths (inline values do not contribute). + let input = Arc::new(StringViewArray::from(vec![ + Some("HELLO WORLD 123"), // 15 bytes (long) + Some("ABC"), // inline + None, + Some("DATAFUSION ROCKS!"), // 17 bytes (long) + Some("ANOTHER LONG STRING"), // 19 bytes (long) + ])) as ArrayRef; + let result = invoke_lower(input)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected = StringViewArray::from(vec![ + Some("hello world 123"), + Some("abc"), + None, + Some("datafusion rocks!"), + Some("another long string"), + ]); + assert_eq!(result_sv, &expected); + assert_eq!(result_sv.data_buffers().len(), 1); + assert_eq!(result_sv.data_buffers()[0].len(), 15 + 17 + 19); + Ok(()) + } + + #[test] + fn lower_utf8view_splits_into_multiple_buffers() -> Result<()> { + // Produce enough long-string output to overflow the first data block + // (≈16 KiB after the initial doubling) and confirm the fast path + // splits across buffers rather than packing everything into one and + // risking the i32::MAX offset limit. + const STR_LEN: usize = 500; + const N: usize = 40; // 40 × 500 B = 20 KiB total — crosses the first block. + let value = "X".repeat(STR_LEN); + let inputs: Vec> = (0..N).map(|_| Some(value.clone())).collect(); + let input = Arc::new(StringViewArray::from(inputs.clone())) as ArrayRef; + let result = invoke_lower(input)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected_value = "x".repeat(STR_LEN); + let expected: Vec> = + (0..N).map(|_| Some(expected_value.as_str())).collect(); + assert_eq!(result_sv, &StringViewArray::from(expected)); + assert!( + result_sv.data_buffers().len() >= 2, + "expected the output to span more than one data buffer, got {}", + result_sv.data_buffers().len() + ); + // Total bytes across buffers must equal total long-value bytes + // (no row was inlined since each value is > 12 bytes). + let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum(); + assert_eq!(total, N * STR_LEN); + Ok(()) + } + #[test] fn lower_sliced_utf8() -> Result<()> { let parent = Arc::new(StringArray::from(vec![ diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs index 8d9109bc2ab60..872539854a9eb 100644 --- a/datafusion/functions/src/strings.rs +++ b/datafusion/functions/src/strings.rs @@ -531,12 +531,12 @@ impl GenericStringArrayBuilder { } } -/// Starting size for the long-string data block; matches Arrow's -/// `GenericByteViewBuilder` default. -const STARTING_BLOCK_SIZE: u32 = 8 * 1024; -/// Maximum size each long-string data block grows to; matches Arrow's -/// `GenericByteViewBuilder` default. -const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; +/// Starting size for the long-string data block used by `StringView`-style +/// arrays; matches Arrow's `GenericByteViewBuilder` default. +pub(crate) const STRING_VIEW_INIT_BLOCK_SIZE: u32 = 8 * 1024; +/// Maximum size each long-string data block in a `StringView`-style array +/// grows to; matches Arrow's `GenericByteViewBuilder` default. +pub(crate) const STRING_VIEW_MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; /// Builder for a [`StringViewArray`] that defers null tracking to `finish`. /// @@ -545,13 +545,11 @@ const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024; /// Short strings (≤ 12 bytes) are inlined into the view itself; long strings /// are appended into an in-progress data block. When the in-progress block /// fills up it is flushed into `completed` and a new block — double the size -/// of the last, capped at [`MAX_BLOCK_SIZE`] — is started. +/// of the last, capped at [`STRING_VIEW_MAX_BLOCK_SIZE`] — is started. pub(crate) struct StringViewArrayBuilder { views: Vec, in_progress: Vec, completed: Vec, - /// Current block-size target; doubles each time a block is flushed, up to - /// [`MAX_BLOCK_SIZE`]. block_size: u32, placeholder_count: usize, } @@ -562,15 +560,14 @@ impl StringViewArrayBuilder { views: Vec::with_capacity(item_capacity), in_progress: Vec::new(), completed: Vec::new(), - block_size: STARTING_BLOCK_SIZE, + block_size: STRING_VIEW_INIT_BLOCK_SIZE, placeholder_count: 0, } } - /// Doubles the block-size target (capped at [`MAX_BLOCK_SIZE`]) and - /// returns the new size. The first call returns `2 * STARTING_BLOCK_SIZE`. + /// Doubles the block-size target and returns the new size. fn next_block_size(&mut self) -> u32 { - if self.block_size < MAX_BLOCK_SIZE { + if self.block_size < STRING_VIEW_MAX_BLOCK_SIZE { self.block_size = self.block_size.saturating_mul(2); } self.block_size @@ -988,7 +985,7 @@ mod tests { #[test] fn string_view_array_builder_flushes_full_blocks() { - // Each value is 300 bytes. The first data block is 2 × STARTING_BLOCK_SIZE + // Each value is 300 bytes. The first data block is 2 × STRING_VIEW_INIT_BLOCK_SIZE // = 16 KiB, so ~50 values saturate it and the rest spill into additional // blocks. let value = "x".repeat(300); From 4ec33f69c789095535b7fcddd0b59e4782492e99 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sat, 2 May 2026 10:00:24 -0400 Subject: [PATCH 2/3] Add comments, per review --- datafusion/functions/src/string/common.rs | 33 +++++++++++++++++------ 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 6a673d3a22c0d..2732ba4f86ef2 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -461,10 +461,11 @@ fn case_conversion_utf8view_ascii( } } -/// Walks the views once: inline rows (length ≤ 12) convert their inline bytes -/// in place; long rows copy their referenced bytes into a single packed output -/// buffer while converting, then rewrite the view (`buffer_index = 0`, new -/// offset, new 4-byte prefix) to point at it. +/// Walks the views once and produces a new `StringViewArray` with +/// case-converted bytes. Inline strings (<= 12 bytes) are converted in-place; +/// long strings copy-and-convert into output buffers and have their view fields +/// rewritten to address the new bytes. ASCII case conversion preserves is byte +/// length, so no row migrates between the inline and long layouts. fn case_conversion_utf8view_ascii_inner u8>( array: &StringViewArray, convert: F, @@ -475,16 +476,22 @@ fn case_conversion_utf8view_ascii_inner u8>( let nulls = array.nulls(); let mut new_views: Vec = Vec::with_capacity(item_len); + // Long values are packed into `in_progress`; when full it is sealed into + // `completed` and a new, larger block is started — same block-doubling + // scheme as Arrow's `GenericByteViewBuilder`. let mut in_progress: Vec = Vec::new(); let mut completed: Vec = Vec::new(); let mut block_size: u32 = STRING_VIEW_INIT_BLOCK_SIZE; for i in 0..item_len { if nulls.is_some_and(|n| n.is_null(i)) { + // Zero view = empty, no buffer reference; the null buffer is what + // marks the row null, so the view's value is irrelevant. new_views.push(0); continue; } let view = views[i]; + // Length is the low 32 bits; `as u32` discards the rest of the view. let len = view as u32 as usize; if len == 0 { new_views.push(0); @@ -492,14 +499,18 @@ fn case_conversion_utf8view_ascii_inner u8>( } let mut bytes = view.to_le_bytes(); if len <= 12 { - // Inline row: convert the inline data bytes; layout unchanged. + // Inline: value is in bytes[4..4+len], no buffer reference. Convert + // in place; nothing else in the view needs to change. for b in &mut bytes[4..4 + len] { *b = convert(b); } new_views.push(u128::from_le_bytes(bytes)); } else { - // Make sure the current data block has room for this value; - // otherwise flush and start a new, larger block. + // Long: input view points into shared `data_buffers` we can't + // mutate, so copy-convert into our own buffer and rewrite the + // view's prefix/buffer_index/offset (length is preserved). + + // Ensure the current block has room; otherwise flush and grow. let required_cap = in_progress.len() + len; if in_progress.capacity() < required_cap { if !in_progress.is_empty() { @@ -512,12 +523,16 @@ fn case_conversion_utf8view_ascii_inner u8>( in_progress.reserve(to_reserve); } + // The in-progress block will be sealed at index `completed.len()`, + // and our value starts at the current write position within it. let buffer_index: u32 = i32::try_from(completed.len()) .expect("buffer count exceeds i32::MAX") as u32; let new_offset: u32 = i32::try_from(in_progress.len()).expect("offset exceeds i32::MAX") as u32; + // Source location from the input view: bytes 8..12 are buffer + // index, bytes 12..16 are the offset within it. let src_buffer_index = u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize; let src_offset = @@ -528,7 +543,9 @@ fn case_conversion_utf8view_ascii_inner u8>( let prefix_start = in_progress.len(); in_progress.extend(src.iter().map(&convert)); - // Prefix is the first 4 bytes of the converted data we just wrote. + // Rewrite the three long-view fields; bytes[0..4] (length) is + // left untouched. The prefix is read back from the bytes we just + // wrote so the converted value has a single source of truth. let prefix: [u8; 4] = in_progress[prefix_start..prefix_start + 4] .try_into() .unwrap(); From e88e44323dd87ee435a6b192c0ee5bc1297f7691 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Sun, 3 May 2026 12:24:35 -0400 Subject: [PATCH 3/3] Add tests for upper, per review --- datafusion/functions/src/string/upper.rs | 181 +++++++++++++++++++++-- 1 file changed, 172 insertions(+), 9 deletions(-) diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index a89c3dfb05063..c0ac90b1bc598 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -95,22 +95,24 @@ mod tests { use datafusion_common::config::ConfigOptions; use std::sync::Arc; - fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> { + fn invoke_upper(input: ArrayRef) -> Result { let func = UpperFunc::new(); - - let arg_field = Field::new("a", input.data_type().clone(), true).into(); + let data_type = input.data_type().clone(); let args = ScalarFunctionArgs { number_rows: input.len(), args: vec![ColumnarValue::Array(input)], - arg_fields: vec![arg_field], - return_field: Field::new("f", expected.data_type().clone(), true).into(), + arg_fields: vec![Field::new("a", data_type.clone(), true).into()], + return_field: Field::new("f", data_type, true).into(), config_options: Arc::new(ConfigOptions::default()), }; - - let result = match func.invoke_with_args(args)? { - ColumnarValue::Array(result) => result, + match func.invoke_with_args(args)? { + ColumnarValue::Array(r) => Ok(r), _ => unreachable!("upper"), - }; + } + } + + fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> { + let result = invoke_upper(input)?; assert_eq!(&expected, &result); Ok(()) } @@ -206,4 +208,165 @@ mod tests { to_upper(input, expected) } + + #[test] + fn upper_ascii_utf8view() -> Result<()> { + // Mix of inlined (≤12 bytes) and referenced (>12 bytes) strings, plus + // a null and an empty, to exercise the all-ASCII Utf8View fast path. + let input = Arc::new(StringViewArray::from(vec![ + Some("arrow"), // inlined short + None, + Some("hello world 123"), // referenced (15 bytes) + Some(""), + Some("0123456789"), // inlined, no case change + Some("datafusion is cool"), // referenced + ])) as ArrayRef; + + let expected = Arc::new(StringViewArray::from(vec![ + Some("ARROW"), + None, + Some("HELLO WORLD 123"), + Some(""), + Some("0123456789"), + Some("DATAFUSION IS COOL"), + ])) as ArrayRef; + + to_upper(input, expected) + } + + #[test] + fn upper_sliced_ascii_utf8view() -> Result<()> { + // Slice of a parent that contains a non-ASCII string outside the + // slice. The slice is all-ASCII, so the fast path must run and produce + // correct output while the parent's unaddressed non-ASCII bytes are + // irrelevant to the result. + let parent = Arc::new(StringViewArray::from(vec![ + Some("农历新年long enough for buffer"), + Some("hello world 123"), + Some("datafusion rocks!"), + Some("zzzzzzzzzzzzzzzz"), + ])) as ArrayRef; + let sliced = parent.slice(1, 2); + let result = invoke_upper(sliced)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected = StringViewArray::from(vec![ + Some("HELLO WORLD 123"), + Some("DATAFUSION ROCKS!"), + ]); + assert_eq!(result_sv, &expected); + // The slice's two long views address 15 + 17 = 32 bytes; the ASCII + // fast path must produce a single packed buffer of exactly that + // size, not one scaled to the parent's data buffer. + assert_eq!(result_sv.data_buffers().len(), 1); + assert_eq!(result_sv.data_buffers()[0].len(), 32); + Ok(()) + } + + #[test] + fn upper_utf8view_inline_only_no_buffers() -> Result<()> { + // An array whose values are all ≤ 12 bytes is fully inline; the ASCII + // fast path should produce no data buffers at all. + let input = Arc::new(StringViewArray::from(vec![ + Some("hello"), + None, + Some(""), + Some("0123456789AB"), // 12 bytes — inline boundary + ])) as ArrayRef; + let result = invoke_upper(input)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected = StringViewArray::from(vec![ + Some("HELLO"), + None, + Some(""), + Some("0123456789AB"), + ]); + assert_eq!(result_sv, &expected); + assert_eq!( + result_sv.data_buffers().len(), + 0, + "inline-only Utf8View should produce no data buffers" + ); + Ok(()) + } + + #[test] + fn upper_utf8view_long_packs_tight() -> Result<()> { + // Mix of long and inline values; the long values should be packed into + // a single tight output buffer whose size is exactly the sum of their + // lengths (inline values do not contribute). + let input = Arc::new(StringViewArray::from(vec![ + Some("hello world 123"), // 15 bytes (long) + Some("abc"), // inline + None, + Some("datafusion rocks!"), // 17 bytes (long) + Some("another long string"), // 19 bytes (long) + ])) as ArrayRef; + let result = invoke_upper(input)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected = StringViewArray::from(vec![ + Some("HELLO WORLD 123"), + Some("ABC"), + None, + Some("DATAFUSION ROCKS!"), + Some("ANOTHER LONG STRING"), + ]); + assert_eq!(result_sv, &expected); + assert_eq!(result_sv.data_buffers().len(), 1); + assert_eq!(result_sv.data_buffers()[0].len(), 15 + 17 + 19); + Ok(()) + } + + #[test] + fn upper_utf8view_splits_into_multiple_buffers() -> Result<()> { + // Produce enough long-string output to overflow the first data block + // (≈16 KiB after the initial doubling) and confirm the fast path + // splits across buffers rather than packing everything into one and + // risking the i32::MAX offset limit. + const STR_LEN: usize = 500; + const N: usize = 40; // 40 × 500 B = 20 KiB total — crosses the first block. + let value = "x".repeat(STR_LEN); + let inputs: Vec> = (0..N).map(|_| Some(value.clone())).collect(); + let input = Arc::new(StringViewArray::from(inputs.clone())) as ArrayRef; + let result = invoke_upper(input)?; + let result_sv = result.as_any().downcast_ref::().unwrap(); + + let expected_value = "X".repeat(STR_LEN); + let expected: Vec> = + (0..N).map(|_| Some(expected_value.as_str())).collect(); + assert_eq!(result_sv, &StringViewArray::from(expected)); + assert!( + result_sv.data_buffers().len() >= 2, + "expected the output to span more than one data buffer, got {}", + result_sv.data_buffers().len() + ); + // Total bytes across buffers must equal total long-value bytes + // (no row was inlined since each value is > 12 bytes). + let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum(); + assert_eq!(total, N * STR_LEN); + Ok(()) + } + + #[test] + fn upper_sliced_utf8() -> Result<()> { + let parent = Arc::new(StringArray::from(vec![ + Some("aaaaaaaa"), + Some("hello"), + Some("world"), + Some(""), + Some("zzzzzzzz"), + ])) as ArrayRef; + let sliced = parent.slice(1, 3); + let result = invoke_upper(sliced)?; + let result_sa = result.as_any().downcast_ref::().unwrap(); + + let expected = StringArray::from(vec![Some("HELLO"), Some("WORLD"), Some("")]); + assert_eq!(result_sa, &expected); + // The slice's addressed bytes are "hello" + "world" = 10; the ASCII + // fast path must produce a tight output buffer (not the parent's). + assert_eq!(result_sa.value_data().len(), 10); + Ok(()) + } }