From e6c07fc05f793b7d972b0d67c56e1cb8782164b7 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Sat, 25 Apr 2026 08:56:21 -0400
Subject: [PATCH 1/3] .

---
 datafusion/functions/src/string/common.rs | 210 +++++++++++++++++-----
 datafusion/functions/src/string/lower.rs  | 140 +++++++++++++++
 datafusion/functions/src/strings.rs       |  25 ++-
 3 files changed, 314 insertions(+), 61 deletions(-)
diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 70699c3653c4b..6a673d3a22c0d 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -19,7 +19,10 @@
 
 use std::sync::Arc;
 
-use crate::strings::{GenericStringArrayBuilder, StringViewArrayBuilder, append_view};
+use crate::strings::{
+    GenericStringArrayBuilder, STRING_VIEW_INIT_BLOCK_SIZE, STRING_VIEW_MAX_BLOCK_SIZE,
+    StringViewArrayBuilder, append_view,
+};
 use arrow::array::{
     Array, ArrayRef, GenericStringArray, NullBufferBuilder, OffsetSizeTrait,
     StringViewArray, new_null_array,
@@ -323,32 +326,42 @@ fn string_trim<T: OffsetSizeTrait, Tr: Trimmer>(args: &[ArrayRef]) -> Result<Arr
 }
 
 pub(crate) fn to_lower(args: &[ColumnarValue], name: &str) -> Result<ColumnarValue> {
-    case_conversion(args, |string| string.to_lowercase(), name)
+    case_conversion(args, true, name)
 }
 
 pub(crate) fn to_upper(args: &[ColumnarValue], name: &str) -> Result<ColumnarValue> {
-    case_conversion(args, |string| string.to_uppercase(), name)
+    case_conversion(args, false, name)
+}
+
+#[inline]
+fn unicode_case(s: &str, lower: bool) -> String {
+    if lower {
+        s.to_lowercase()
+    } else {
+        s.to_uppercase()
+    }
 }
 
-fn case_conversion<'a, F>(
-    args: &'a [ColumnarValue],
-    op: F,
+fn case_conversion(
+    args: &[ColumnarValue],
+    lower: bool,
     name: &str,
-) -> Result<ColumnarValue>
-where
-    F: Fn(&'a str) -> String,
-{
+) -> Result<ColumnarValue> {
     match &args[0] {
         ColumnarValue::Array(array) => match array.data_type() {
-            DataType::Utf8 => Ok(ColumnarValue::Array(case_conversion_array::<i32, _>(
-                array, op,
+            DataType::Utf8 => Ok(ColumnarValue::Array(case_conversion_array::<i32>(
+                array, lower,
             )?)),
-            DataType::LargeUtf8 => Ok(ColumnarValue::Array(case_conversion_array::<
-                i64,
-                _,
-            >(array, op)?)),
+            DataType::LargeUtf8 => Ok(ColumnarValue::Array(
+                case_conversion_array::<i64>(array, lower)?,
+            )),
             DataType::Utf8View => {
                 let string_array = as_string_view_array(array)?;
+                if string_array.is_ascii() {
+                    return Ok(ColumnarValue::Array(Arc::new(
+                        case_conversion_utf8view_ascii(string_array, lower),
+                    )));
+                }
                 let item_len = string_array.len();
                 // Null-preserving: reuse the input null buffer as the output null buffer.
                 let nulls = string_array.nulls().cloned();
@@ -361,14 +374,14 @@ where
                         } else {
                             // SAFETY: `n.is_null(i)` was false in the branch above.
                             let s = unsafe { string_array.value_unchecked(i) };
-                            builder.append_value(&op(s));
+                            builder.append_value(&unicode_case(s, lower));
                         }
                     }
                 } else {
                     for i in 0..item_len {
                         // SAFETY: no null buffer means every index is valid.
                         let s = unsafe { string_array.value_unchecked(i) };
-                        builder.append_value(&op(s));
+                        builder.append_value(&unicode_case(s, lower));
                     }
                 }
 
@@ -378,15 +391,15 @@ where
         },
         ColumnarValue::Scalar(scalar) => match scalar {
             ScalarValue::Utf8(a) => {
-                let result = a.as_ref().map(|x| op(x));
+                let result = a.as_ref().map(|x| unicode_case(x, lower));
                 Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
             }
             ScalarValue::LargeUtf8(a) => {
-                let result = a.as_ref().map(|x| op(x));
+                let result = a.as_ref().map(|x| unicode_case(x, lower));
                 Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result)))
             }
             ScalarValue::Utf8View(a) => {
-                let result = a.as_ref().map(|x| op(x));
+                let result = a.as_ref().map(|x| unicode_case(x, lower));
                 Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(result)))
             }
             other => exec_err!("Unsupported data type {other:?} for function {name}"),
@@ -394,16 +407,15 @@ where
     }
 }
 
-fn case_conversion_array<'a, O, F>(array: &'a ArrayRef, op: F) -> Result<ArrayRef>
-where
-    O: OffsetSizeTrait,
-    F: Fn(&'a str) -> String,
-{
+fn case_conversion_array<O: OffsetSizeTrait>(
+    array: &ArrayRef,
+    lower: bool,
+) -> Result<ArrayRef> {
     const PRE_ALLOC_BYTES: usize = 8;
 
     let string_array = as_generic_string_array::<O>(array)?;
     if string_array.is_ascii() {
-        return case_conversion_ascii_array::<O, _>(string_array, op);
+        return case_conversion_ascii_array::<O>(string_array, lower);
     }
 
     // Values contain non-ASCII.
@@ -423,43 +435,147 @@ where
             } else {
                 // SAFETY: `n.is_null(i)` was false in the branch above.
                 let s = unsafe { string_array.value_unchecked(i) };
-                builder.append_value(&op(s));
+                builder.append_value(&unicode_case(s, lower));
             }
         }
     } else {
         for i in 0..item_len {
             // SAFETY: no null buffer means every index is valid.
             let s = unsafe { string_array.value_unchecked(i) };
-            builder.append_value(&op(s));
+            builder.append_value(&unicode_case(s, lower));
         }
     }
     Ok(Arc::new(builder.finish(nulls)?))
 }
 
+/// Fast path for case conversion on an all-ASCII `StringViewArray`.
+fn case_conversion_utf8view_ascii(
+    array: &StringViewArray,
+    lower: bool,
+) -> StringViewArray {
+    // Specialize per conversion so the byte call inlines in the hot loops below.
+    if lower {
+        case_conversion_utf8view_ascii_inner(array, u8::to_ascii_lowercase)
+    } else {
+        case_conversion_utf8view_ascii_inner(array, u8::to_ascii_uppercase)
+    }
+}
+
+/// Walks the views once: inline rows (length ≤ 12) convert their inline bytes
+/// in place; long rows copy their referenced bytes into a single packed output
+/// buffer while converting, then rewrite the view (`buffer_index = 0`, new
+/// offset, new 4-byte prefix) to point at it.
+fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
+    array: &StringViewArray,
+    convert: F,
+) -> StringViewArray {
+    let item_len = array.len();
+    let views = array.views();
+    let data_buffers = array.data_buffers();
+    let nulls = array.nulls();
+
+    let mut new_views: Vec<u128> = Vec::with_capacity(item_len);
+    let mut in_progress: Vec<u8> = Vec::new();
+    let mut completed: Vec<Buffer> = Vec::new();
+    let mut block_size: u32 = STRING_VIEW_INIT_BLOCK_SIZE;
+
+    for i in 0..item_len {
+        if nulls.is_some_and(|n| n.is_null(i)) {
+            new_views.push(0);
+            continue;
+        }
+        let view = views[i];
+        let len = view as u32 as usize;
+        if len == 0 {
+            new_views.push(0);
+            continue;
+        }
+        let mut bytes = view.to_le_bytes();
+        if len <= 12 {
+            // Inline row: convert the inline data bytes; layout unchanged.
+            for b in &mut bytes[4..4 + len] {
+                *b = convert(b);
+            }
+            new_views.push(u128::from_le_bytes(bytes));
+        } else {
+            // Make sure the current data block has room for this value;
+            // otherwise flush and start a new, larger block.
+            let required_cap = in_progress.len() + len;
+            if in_progress.capacity() < required_cap {
+                if !in_progress.is_empty() {
+                    completed.push(Buffer::from_vec(std::mem::take(&mut in_progress)));
+                }
+                if block_size < STRING_VIEW_MAX_BLOCK_SIZE {
+                    block_size = block_size.saturating_mul(2);
+                }
+                let to_reserve = len.max(block_size as usize);
+                in_progress.reserve(to_reserve);
+            }
+
+            let buffer_index: u32 = i32::try_from(completed.len())
+                .expect("buffer count exceeds i32::MAX")
+                as u32;
+            let new_offset: u32 =
+                i32::try_from(in_progress.len()).expect("offset exceeds i32::MAX") as u32;
+
+            let src_buffer_index =
+                u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize;
+            let src_offset =
+                u32::from_le_bytes(bytes[12..16].try_into().unwrap()) as usize;
+            let src =
+                &data_buffers[src_buffer_index].as_slice()[src_offset..src_offset + len];
+
+            let prefix_start = in_progress.len();
+            in_progress.extend(src.iter().map(&convert));
+
+            // Prefix is the first 4 bytes of the converted data we just wrote.
+            let prefix: [u8; 4] = in_progress[prefix_start..prefix_start + 4]
+                .try_into()
+                .unwrap();
+            bytes[4..8].copy_from_slice(&prefix);
+            bytes[8..12].copy_from_slice(&buffer_index.to_le_bytes());
+            bytes[12..16].copy_from_slice(&new_offset.to_le_bytes());
+            new_views.push(u128::from_le_bytes(bytes));
+        }
+    }
+
+    if !in_progress.is_empty() {
+        completed.push(Buffer::from_vec(in_progress));
+    }
+
+    // SAFETY: each long view's buffer_index addresses a buffer we wrote, and
+    // its offset addresses bytes within that buffer; prefixes were copied from
+    // those same bytes; inline views were rewritten from valid inline bytes;
+    // null/empty rows are zero views with no buffer reference; row count is
+    // unchanged.
+    unsafe {
+        StringViewArray::new_unchecked(
+            ScalarBuffer::from(new_views),
+            completed,
+            array.nulls().cloned(),
+        )
+    }
+}
+
 /// Fast path for case conversion on an all-ASCII string array. ASCII case
 /// conversion is byte-length-preserving, so we can convert the entire addressed
-/// range in one call and reuse the offsets and nulls buffers — rebasing the
-/// offsets when the input is a sliced array.
-fn case_conversion_ascii_array<'a, O, F>(
-    string_array: &'a GenericStringArray<O>,
-    op: F,
-) -> Result<ArrayRef>
-where
-    O: OffsetSizeTrait,
-    F: Fn(&'a str) -> String,
-{
+/// byte range in one pass over the value buffer and reuse the offsets and nulls
+/// buffers — rebasing the offsets when the input is a sliced array.
+fn case_conversion_ascii_array<O: OffsetSizeTrait>(
+    string_array: &GenericStringArray<O>,
+    lower: bool,
+) -> Result<ArrayRef> {
     let value_offsets = string_array.value_offsets();
     let start = value_offsets.first().unwrap().as_usize();
     let end = value_offsets.last().unwrap().as_usize();
     let relevant = &string_array.value_data()[start..end];
 
-    // SAFETY: `relevant` is a subslice of the string array's value buffer,
-    // which is valid UTF-8.
-    let str_values = unsafe { std::str::from_utf8_unchecked(relevant) };
-
-    let converted_values = op(str_values);
-    debug_assert_eq!(converted_values.len(), str_values.len());
-    let values = Buffer::from_vec(converted_values.into_bytes());
+    let converted: Vec<u8> = if lower {
+        relevant.iter().map(u8::to_ascii_lowercase).collect()
+    } else {
+        relevant.iter().map(u8::to_ascii_uppercase).collect()
+    };
+    let values = Buffer::from_vec(converted);
 
     // Shift offsets from `start`-based to 0-based so they index into `values`.
     let offsets = if start == 0 {
@@ -468,7 +584,7 @@ where
         let s = O::usize_as(start);
         let rebased: Vec<O> = value_offsets.iter().map(|&o| o - s).collect();
         // SAFETY: subtracting a constant from monotonic offsets preserves
-        // monotonicity, and `start` is the minimum offset so no underflow.
+        // monotonicity, and `start` is the minimum offset, so no underflow.
         unsafe { OffsetBuffer::new_unchecked(ScalarBuffer::from(rebased)) }
     };
 
diff --git a/datafusion/functions/src/string/lower.rs b/datafusion/functions/src/string/lower.rs
index 0b0476d0ec712..57cbe1d8779f0 100644
--- a/datafusion/functions/src/string/lower.rs
+++ b/datafusion/functions/src/string/lower.rs
@@ -210,6 +210,146 @@ mod tests {
         to_lower(input, expected)
     }
 
+    #[test]
+    fn lower_ascii_utf8view() -> Result<()> {
+        // Mix of inlined (≤12 bytes) and referenced (>12 bytes) strings, plus
+        // a null and an empty, to exercise the all-ASCII Utf8View fast path.
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("ARROW"), // inlined short
+            None,
+            Some("HELLO WORLD 123"), // referenced (15 bytes)
+            Some(""),
+            Some("0123456789"),         // inlined, no case change
+            Some("DATAFUSION IS COOL"), // referenced
+        ])) as ArrayRef;
+
+        let expected = Arc::new(StringViewArray::from(vec![
+            Some("arrow"),
+            None,
+            Some("hello world 123"),
+            Some(""),
+            Some("0123456789"),
+            Some("datafusion is cool"),
+        ])) as ArrayRef;
+
+        to_lower(input, expected)
+    }
+
+    #[test]
+    fn lower_sliced_ascii_utf8view() -> Result<()> {
+        // Slice of a parent that contains a non-ASCII string outside the
+        // slice. The slice is all-ASCII, so the fast path must run and produce
+        // correct output while the parent's unaddressed non-ASCII bytes are
+        // irrelevant to the result.
+        let parent = Arc::new(StringViewArray::from(vec![
+            Some("农历新年LONG ENOUGH FOR BUFFER"),
+            Some("HELLO WORLD 123"),
+            Some("DATAFUSION ROCKS!"),
+            Some("ZZZZZZZZZZZZZZZZ"),
+        ])) as ArrayRef;
+        let sliced = parent.slice(1, 2);
+        let result = invoke_lower(sliced)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected = StringViewArray::from(vec![
+            Some("hello world 123"),
+            Some("datafusion rocks!"),
+        ]);
+        assert_eq!(result_sv, &expected);
+        // The slice's two long views address 15 + 17 = 32 bytes; the ASCII
+        // fast path must produce a single packed buffer of exactly that
+        // size, not one scaled to the parent's data buffer.
+        assert_eq!(result_sv.data_buffers().len(), 1);
+        assert_eq!(result_sv.data_buffers()[0].len(), 32);
+        Ok(())
+    }
+
+    #[test]
+    fn lower_utf8view_inline_only_no_buffers() -> Result<()> {
+        // An array whose values are all ≤ 12 bytes is fully inline; the ASCII
+        // fast path should produce no data buffers at all.
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("HELLO"),
+            None,
+            Some(""),
+            Some("0123456789ab"), // 12 bytes — inline boundary
+        ])) as ArrayRef;
+        let result = invoke_lower(input)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected = StringViewArray::from(vec![
+            Some("hello"),
+            None,
+            Some(""),
+            Some("0123456789ab"),
+        ]);
+        assert_eq!(result_sv, &expected);
+        assert_eq!(
+            result_sv.data_buffers().len(),
+            0,
+            "inline-only Utf8View should produce no data buffers"
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn lower_utf8view_long_packs_tight() -> Result<()> {
+        // Mix of long and inline values; the long values should be packed into
+        // a single tight output buffer whose size is exactly the sum of their
+        // lengths (inline values do not contribute).
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("HELLO WORLD 123"), // 15 bytes (long)
+            Some("ABC"),             // inline
+            None,
+            Some("DATAFUSION ROCKS!"),   // 17 bytes (long)
+            Some("ANOTHER LONG STRING"), // 19 bytes (long)
+        ])) as ArrayRef;
+        let result = invoke_lower(input)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected = StringViewArray::from(vec![
+            Some("hello world 123"),
+            Some("abc"),
+            None,
+            Some("datafusion rocks!"),
+            Some("another long string"),
+        ]);
+        assert_eq!(result_sv, &expected);
+        assert_eq!(result_sv.data_buffers().len(), 1);
+        assert_eq!(result_sv.data_buffers()[0].len(), 15 + 17 + 19);
+        Ok(())
+    }
+
+    #[test]
+    fn lower_utf8view_splits_into_multiple_buffers() -> Result<()> {
+        // Produce enough long-string output to overflow the first data block
+        // (≈16 KiB after the initial doubling) and confirm the fast path
+        // splits across buffers rather than packing everything into one and
+        // risking the i32::MAX offset limit.
+        const STR_LEN: usize = 500;
+        const N: usize = 40; // 40 × 500 B = 20 KiB total — crosses the first block.
+        let value = "X".repeat(STR_LEN);
+        let inputs: Vec<Option<String>> = (0..N).map(|_| Some(value.clone())).collect();
+        let input = Arc::new(StringViewArray::from(inputs.clone())) as ArrayRef;
+        let result = invoke_lower(input)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected_value = "x".repeat(STR_LEN);
+        let expected: Vec<Option<&str>> =
+            (0..N).map(|_| Some(expected_value.as_str())).collect();
+        assert_eq!(result_sv, &StringViewArray::from(expected));
+        assert!(
+            result_sv.data_buffers().len() >= 2,
+            "expected the output to span more than one data buffer, got {}",
+            result_sv.data_buffers().len()
+        );
+        // Total bytes across buffers must equal total long-value bytes
+        // (no row was inlined since each value is > 12 bytes).
+        let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum();
+        assert_eq!(total, N * STR_LEN);
+        Ok(())
+    }
+
     #[test]
     fn lower_sliced_utf8() -> Result<()> {
         let parent = Arc::new(StringArray::from(vec![
diff --git a/datafusion/functions/src/strings.rs b/datafusion/functions/src/strings.rs
index 8d9109bc2ab60..872539854a9eb 100644
--- a/datafusion/functions/src/strings.rs
+++ b/datafusion/functions/src/strings.rs
@@ -531,12 +531,12 @@ impl<O: OffsetSizeTrait> GenericStringArrayBuilder<O> {
     }
 }
 
-/// Starting size for the long-string data block; matches Arrow's
-/// `GenericByteViewBuilder` default.
-const STARTING_BLOCK_SIZE: u32 = 8 * 1024;
-/// Maximum size each long-string data block grows to; matches Arrow's
-/// `GenericByteViewBuilder` default.
-const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;
+/// Starting size for the long-string data block used by `StringView`-style
+/// arrays; matches Arrow's `GenericByteViewBuilder` default.
+pub(crate) const STRING_VIEW_INIT_BLOCK_SIZE: u32 = 8 * 1024;
+/// Maximum size each long-string data block in a `StringView`-style array
+/// grows to; matches Arrow's `GenericByteViewBuilder` default.
+pub(crate) const STRING_VIEW_MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;
 
 /// Builder for a [`StringViewArray`] that defers null tracking to `finish`.
 ///
@@ -545,13 +545,11 @@ const MAX_BLOCK_SIZE: u32 = 2 * 1024 * 1024;
 /// Short strings (≤ 12 bytes) are inlined into the view itself; long strings
 /// are appended into an in-progress data block. When the in-progress block
 /// fills up it is flushed into `completed` and a new block — double the size
-/// of the last, capped at [`MAX_BLOCK_SIZE`] — is started.
+/// of the last, capped at [`STRING_VIEW_MAX_BLOCK_SIZE`] — is started.
 pub(crate) struct StringViewArrayBuilder {
     views: Vec<u128>,
     in_progress: Vec<u8>,
     completed: Vec<Buffer>,
-    /// Current block-size target; doubles each time a block is flushed, up to
-    /// [`MAX_BLOCK_SIZE`].
     block_size: u32,
     placeholder_count: usize,
 }
@@ -562,15 +560,14 @@ impl StringViewArrayBuilder {
             views: Vec::with_capacity(item_capacity),
             in_progress: Vec::new(),
             completed: Vec::new(),
-            block_size: STARTING_BLOCK_SIZE,
+            block_size: STRING_VIEW_INIT_BLOCK_SIZE,
             placeholder_count: 0,
         }
     }
 
-    /// Doubles the block-size target (capped at [`MAX_BLOCK_SIZE`]) and
-    /// returns the new size. The first call returns `2 * STARTING_BLOCK_SIZE`.
+    /// Doubles the block-size target and returns the new size.
     fn next_block_size(&mut self) -> u32 {
-        if self.block_size < MAX_BLOCK_SIZE {
+        if self.block_size < STRING_VIEW_MAX_BLOCK_SIZE {
             self.block_size = self.block_size.saturating_mul(2);
         }
         self.block_size
@@ -988,7 +985,7 @@ mod tests {
 
     #[test]
     fn string_view_array_builder_flushes_full_blocks() {
-        // Each value is 300 bytes. The first data block is 2 × STARTING_BLOCK_SIZE
+        // Each value is 300 bytes. The first data block is 2 × STRING_VIEW_INIT_BLOCK_SIZE
         // = 16 KiB, so ~50 values saturate it and the rest spill into additional
         // blocks.
         let value = "x".repeat(300);

From 4ec33f69c789095535b7fcddd0b59e4782492e99 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Sat, 2 May 2026 10:00:24 -0400
Subject: [PATCH 2/3] Add comments, per review

---
 datafusion/functions/src/string/common.rs | 33 +++++++++++++++++------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs
index 6a673d3a22c0d..2732ba4f86ef2 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -461,10 +461,11 @@ fn case_conversion_utf8view_ascii(
     }
 }
 
-/// Walks the views once: inline rows (length ≤ 12) convert their inline bytes
-/// in place; long rows copy their referenced bytes into a single packed output
-/// buffer while converting, then rewrite the view (`buffer_index = 0`, new
-/// offset, new 4-byte prefix) to point at it.
+/// Walks the views once and produces a new `StringViewArray` with
+/// case-converted bytes. Inline strings (<= 12 bytes) are converted in-place;
+/// long strings copy-and-convert into output buffers and have their view fields
+/// rewritten to address the new bytes. ASCII case conversion preserves is byte
+/// length, so no row migrates between the inline and long layouts.
 fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
     array: &StringViewArray,
     convert: F,
@@ -475,16 +476,22 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
     let nulls = array.nulls();
 
     let mut new_views: Vec<u128> = Vec::with_capacity(item_len);
+    // Long values are packed into `in_progress`; when full it is sealed into
+    // `completed` and a new, larger block is started — same block-doubling
+    // scheme as Arrow's `GenericByteViewBuilder`.
     let mut in_progress: Vec<u8> = Vec::new();
     let mut completed: Vec<Buffer> = Vec::new();
     let mut block_size: u32 = STRING_VIEW_INIT_BLOCK_SIZE;
 
     for i in 0..item_len {
         if nulls.is_some_and(|n| n.is_null(i)) {
+            // Zero view = empty, no buffer reference; the null buffer is what
+            // marks the row null, so the view's value is irrelevant.
             new_views.push(0);
             continue;
         }
         let view = views[i];
+        // Length is the low 32 bits; `as u32` discards the rest of the view.
         let len = view as u32 as usize;
         if len == 0 {
             new_views.push(0);
@@ -492,14 +499,18 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
         }
         let mut bytes = view.to_le_bytes();
         if len <= 12 {
-            // Inline row: convert the inline data bytes; layout unchanged.
+            // Inline: value is in bytes[4..4+len], no buffer reference. Convert
+            // in place; nothing else in the view needs to change.
             for b in &mut bytes[4..4 + len] {
                 *b = convert(b);
             }
             new_views.push(u128::from_le_bytes(bytes));
         } else {
-            // Make sure the current data block has room for this value;
-            // otherwise flush and start a new, larger block.
+            // Long: input view points into shared `data_buffers` we can't
+            // mutate, so copy-convert into our own buffer and rewrite the
+            // view's prefix/buffer_index/offset (length is preserved).
+
+            // Ensure the current block has room; otherwise flush and grow.
             let required_cap = in_progress.len() + len;
             if in_progress.capacity() < required_cap {
                 if !in_progress.is_empty() {
@@ -512,12 +523,16 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
                 in_progress.reserve(to_reserve);
             }
 
+            // The in-progress block will be sealed at index `completed.len()`,
+            // and our value starts at the current write position within it.
             let buffer_index: u32 = i32::try_from(completed.len())
                 .expect("buffer count exceeds i32::MAX")
                 as u32;
             let new_offset: u32 =
                 i32::try_from(in_progress.len()).expect("offset exceeds i32::MAX") as u32;
 
+            // Source location from the input view: bytes 8..12 are buffer
+            // index, bytes 12..16 are the offset within it.
             let src_buffer_index =
                 u32::from_le_bytes(bytes[8..12].try_into().unwrap()) as usize;
             let src_offset =
@@ -528,7 +543,9 @@ fn case_conversion_utf8view_ascii_inner<F: Fn(&u8) -> u8>(
             let prefix_start = in_progress.len();
             in_progress.extend(src.iter().map(&convert));
 
-            // Prefix is the first 4 bytes of the converted data we just wrote.
+            // Rewrite the three long-view fields; bytes[0..4] (length) is
+            // left untouched. The prefix is read back from the bytes we just
+            // wrote so the converted value has a single source of truth.
             let prefix: [u8; 4] = in_progress[prefix_start..prefix_start + 4]
                 .try_into()
                 .unwrap();

From e88e44323dd87ee435a6b192c0ee5bc1297f7691 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Sun, 3 May 2026 12:24:35 -0400
Subject: [PATCH 3/3] Add tests for upper, per review

---
 datafusion/functions/src/string/upper.rs | 181 +++++++++++++++++++++--
 1 file changed, 172 insertions(+), 9 deletions(-)

diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs
index a89c3dfb05063..c0ac90b1bc598 100644
--- a/datafusion/functions/src/string/upper.rs
+++ b/datafusion/functions/src/string/upper.rs
@@ -95,22 +95,24 @@ mod tests {
     use datafusion_common::config::ConfigOptions;
     use std::sync::Arc;
 
-    fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
+    fn invoke_upper(input: ArrayRef) -> Result<ArrayRef> {
         let func = UpperFunc::new();
-
-        let arg_field = Field::new("a", input.data_type().clone(), true).into();
+        let data_type = input.data_type().clone();
         let args = ScalarFunctionArgs {
             number_rows: input.len(),
             args: vec![ColumnarValue::Array(input)],
-            arg_fields: vec![arg_field],
-            return_field: Field::new("f", expected.data_type().clone(), true).into(),
+            arg_fields: vec![Field::new("a", data_type.clone(), true).into()],
+            return_field: Field::new("f", data_type, true).into(),
             config_options: Arc::new(ConfigOptions::default()),
         };
-
-        let result = match func.invoke_with_args(args)? {
-            ColumnarValue::Array(result) => result,
+        match func.invoke_with_args(args)? {
+            ColumnarValue::Array(r) => Ok(r),
             _ => unreachable!("upper"),
-        };
+        }
+    }
+
+    fn to_upper(input: ArrayRef, expected: ArrayRef) -> Result<()> {
+        let result = invoke_upper(input)?;
         assert_eq!(&expected, &result);
         Ok(())
     }
@@ -206,4 +208,165 @@ mod tests {
 
         to_upper(input, expected)
     }
+
+    #[test]
+    fn upper_ascii_utf8view() -> Result<()> {
+        // Mix of inlined (≤12 bytes) and referenced (>12 bytes) strings, plus
+        // a null and an empty, to exercise the all-ASCII Utf8View fast path.
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("arrow"), // inlined short
+            None,
+            Some("hello world 123"), // referenced (15 bytes)
+            Some(""),
+            Some("0123456789"),         // inlined, no case change
+            Some("datafusion is cool"), // referenced
+        ])) as ArrayRef;
+
+        let expected = Arc::new(StringViewArray::from(vec![
+            Some("ARROW"),
+            None,
+            Some("HELLO WORLD 123"),
+            Some(""),
+            Some("0123456789"),
+            Some("DATAFUSION IS COOL"),
+        ])) as ArrayRef;
+
+        to_upper(input, expected)
+    }
+
+    #[test]
+    fn upper_sliced_ascii_utf8view() -> Result<()> {
+        // Slice of a parent that contains a non-ASCII string outside the
+        // slice. The slice is all-ASCII, so the fast path must run and produce
+        // correct output while the parent's unaddressed non-ASCII bytes are
+        // irrelevant to the result.
+        let parent = Arc::new(StringViewArray::from(vec![
+            Some("农历新年long enough for buffer"),
+            Some("hello world 123"),
+            Some("datafusion rocks!"),
+            Some("zzzzzzzzzzzzzzzz"),
+        ])) as ArrayRef;
+        let sliced = parent.slice(1, 2);
+        let result = invoke_upper(sliced)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected = StringViewArray::from(vec![
+            Some("HELLO WORLD 123"),
+            Some("DATAFUSION ROCKS!"),
+        ]);
+        assert_eq!(result_sv, &expected);
+        // The slice's two long views address 15 + 17 = 32 bytes; the ASCII
+        // fast path must produce a single packed buffer of exactly that
+        // size, not one scaled to the parent's data buffer.
+        assert_eq!(result_sv.data_buffers().len(), 1);
+        assert_eq!(result_sv.data_buffers()[0].len(), 32);
+        Ok(())
+    }
+
+    #[test]
+    fn upper_utf8view_inline_only_no_buffers() -> Result<()> {
+        // An array whose values are all ≤ 12 bytes is fully inline; the ASCII
+        // fast path should produce no data buffers at all.
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("hello"),
+            None,
+            Some(""),
+            Some("0123456789AB"), // 12 bytes — inline boundary
+        ])) as ArrayRef;
+        let result = invoke_upper(input)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected = StringViewArray::from(vec![
+            Some("HELLO"),
+            None,
+            Some(""),
+            Some("0123456789AB"),
+        ]);
+        assert_eq!(result_sv, &expected);
+        assert_eq!(
+            result_sv.data_buffers().len(),
+            0,
+            "inline-only Utf8View should produce no data buffers"
+        );
+        Ok(())
+    }
+
+    #[test]
+    fn upper_utf8view_long_packs_tight() -> Result<()> {
+        // Mix of long and inline values; the long values should be packed into
+        // a single tight output buffer whose size is exactly the sum of their
+        // lengths (inline values do not contribute).
+        let input = Arc::new(StringViewArray::from(vec![
+            Some("hello world 123"), // 15 bytes (long)
+            Some("abc"),             // inline
+            None,
+            Some("datafusion rocks!"),   // 17 bytes (long)
+            Some("another long string"), // 19 bytes (long)
+        ])) as ArrayRef;
+        let result = invoke_upper(input)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected = StringViewArray::from(vec![
+            Some("HELLO WORLD 123"),
+            Some("ABC"),
+            None,
+            Some("DATAFUSION ROCKS!"),
+            Some("ANOTHER LONG STRING"),
+        ]);
+        assert_eq!(result_sv, &expected);
+        assert_eq!(result_sv.data_buffers().len(), 1);
+        assert_eq!(result_sv.data_buffers()[0].len(), 15 + 17 + 19);
+        Ok(())
+    }
+
+    #[test]
+    fn upper_utf8view_splits_into_multiple_buffers() -> Result<()> {
+        // Produce enough long-string output to overflow the first data block
+        // (≈16 KiB after the initial doubling) and confirm the fast path
+        // splits across buffers rather than packing everything into one and
+        // risking the i32::MAX offset limit.
+        const STR_LEN: usize = 500;
+        const N: usize = 40; // 40 × 500 B = 20 KiB total — crosses the first block.
+        let value = "x".repeat(STR_LEN);
+        let inputs: Vec<Option<String>> = (0..N).map(|_| Some(value.clone())).collect();
+        let input = Arc::new(StringViewArray::from(inputs.clone())) as ArrayRef;
+        let result = invoke_upper(input)?;
+        let result_sv = result.as_any().downcast_ref::<StringViewArray>().unwrap();
+
+        let expected_value = "X".repeat(STR_LEN);
+        let expected: Vec<Option<&str>> =
+            (0..N).map(|_| Some(expected_value.as_str())).collect();
+        assert_eq!(result_sv, &StringViewArray::from(expected));
+        assert!(
+            result_sv.data_buffers().len() >= 2,
+            "expected the output to span more than one data buffer, got {}",
+            result_sv.data_buffers().len()
+        );
+        // Total bytes across buffers must equal total long-value bytes
+        // (no row was inlined since each value is > 12 bytes).
+        let total: usize = result_sv.data_buffers().iter().map(|b| b.len()).sum();
+        assert_eq!(total, N * STR_LEN);
+        Ok(())
+    }
+
+    #[test]
+    fn upper_sliced_utf8() -> Result<()> {
+        let parent = Arc::new(StringArray::from(vec![
+            Some("aaaaaaaa"),
+            Some("hello"),
+            Some("world"),
+            Some(""),
+            Some("zzzzzzzz"),
+        ])) as ArrayRef;
+        let sliced = parent.slice(1, 3);
+        let result = invoke_upper(sliced)?;
+        let result_sa = result.as_any().downcast_ref::<StringArray>().unwrap();
+
+        let expected = StringArray::from(vec![Some("HELLO"), Some("WORLD"), Some("")]);
+        assert_eq!(result_sa, &expected);
+        // The slice's addressed bytes are "hello" + "world" = 10; the ASCII
+        // fast path must produce a tight output buffer (not the parent's).
+        assert_eq!(result_sa.value_data().len(), 10);
+        Ok(())
+    }
 }