|
15 | 15 | // specific language governing permissions and limitations |
16 | 16 | // under the License. |
17 | 17 |
|
18 | | -use arrow::array::{Array, ArrayRef, GenericStringArray, ListArray}; |
| 18 | +use arrow::array::{ |
| 19 | + Array, ArrayBuilder, ArrayRef, GenericListArray, GenericStringArray, GenericStringBuilder, |
| 20 | + ListArray, OffsetSizeTrait, |
| 21 | +}; |
| 22 | +use arrow::buffer::OffsetBuffer; |
19 | 23 | use arrow::datatypes::{DataType, Field}; |
20 | 24 | use datafusion::common::{ |
21 | 25 | cast::as_generic_string_array, exec_err, DataFusionError, Result as DataFusionResult, |
@@ -115,89 +119,99 @@ fn split_array( |
115 | 119 | DataFusionError::Execution(format!("Invalid regex pattern '{}': {}", pattern, e)) |
116 | 120 | })?; |
117 | 121 |
|
118 | | - let string_array = match string_array.data_type() { |
119 | | - DataType::Utf8 => as_generic_string_array::<i32>(string_array)?, |
120 | | - DataType::LargeUtf8 => { |
121 | | - // Convert LargeUtf8 to Utf8 for processing |
122 | | - let large_array = as_generic_string_array::<i64>(string_array)?; |
123 | | - return split_large_string_array(large_array, &regex, limit); |
| 122 | + match string_array.data_type() { |
| 123 | + DataType::Utf8 => { |
| 124 | + split_generic::<i32>(as_generic_string_array::<i32>(string_array)?, &regex, limit) |
124 | 125 | } |
125 | | - _ => { |
126 | | - return exec_err!( |
127 | | - "split expects Utf8 or LargeUtf8 string array, got {:?}", |
128 | | - string_array.data_type() |
129 | | - ); |
| 126 | + DataType::LargeUtf8 => { |
| 127 | + split_generic::<i64>(as_generic_string_array::<i64>(string_array)?, &regex, limit) |
130 | 128 | } |
131 | | - }; |
| 129 | + _ => exec_err!( |
| 130 | + "split expects Utf8 or LargeUtf8 string array, got {:?}", |
| 131 | + string_array.data_type() |
| 132 | + ), |
| 133 | + } |
| 134 | +} |
132 | 135 |
|
133 | | - // Build the result ListArray |
134 | | - let mut offsets: Vec<i32> = Vec::with_capacity(string_array.len() + 1); |
135 | | - let mut values: Vec<String> = Vec::new(); |
136 | | - let mut null_buffer_builder = arrow::array::BooleanBufferBuilder::new(string_array.len()); |
137 | | - offsets.push(0); |
138 | | - |
139 | | - for i in 0..string_array.len() { |
140 | | - if string_array.is_null(i) { |
141 | | - // NULL input produces NULL in result (Spark behavior) |
142 | | - offsets.push(offsets[i]); |
143 | | - null_buffer_builder.append(false); // false = NULL |
144 | | - } else { |
145 | | - let string_val = string_array.value(i); |
146 | | - let parts = split_with_regex(string_val, &regex, limit); |
147 | | - values.extend(parts); |
148 | | - offsets.push(values.len() as i32); |
149 | | - null_buffer_builder.append(true); // true = valid |
| 136 | +fn split_generic<O: OffsetSizeTrait>( |
| 137 | + string_array: &GenericStringArray<O>, |
| 138 | + regex: &Regex, |
| 139 | + limit: i32, |
| 140 | +) -> DataFusionResult<ColumnarValue> { |
| 141 | + let len = string_array.len(); |
| 142 | + let mut offsets: Vec<O> = Vec::with_capacity(len + 1); |
| 143 | + let mut values_builder = GenericStringBuilder::<O>::new(); |
| 144 | + offsets.push(O::usize_as(0)); |
| 145 | + |
| 146 | + // Bulk-NULL: output null mask equals input's, so reuse it instead of |
| 147 | + // tracking per-row in a NullBufferBuilder. Null rows contribute no parts |
| 148 | + // (offset does not advance) and the cloned NullBuffer marks them. |
| 149 | + for i in 0..len { |
| 150 | + if !string_array.is_null(i) { |
| 151 | + let s = string_array.value(i); |
| 152 | + push_split_parts(s, regex, limit, &mut values_builder); |
150 | 153 | } |
| 154 | + offsets.push(O::usize_as(values_builder.len())); |
151 | 155 | } |
152 | 156 |
|
153 | | - let values_array = Arc::new(GenericStringArray::<i32>::from(values)) as ArrayRef; |
154 | | - let field = Arc::new(Field::new("item", DataType::Utf8, false)); |
155 | | - let nulls = arrow::buffer::NullBuffer::new(null_buffer_builder.finish()); |
156 | | - let list_array = ListArray::new( |
| 157 | + let values_array = Arc::new(values_builder.finish()) as ArrayRef; |
| 158 | + let item_type = if O::IS_LARGE { |
| 159 | + DataType::LargeUtf8 |
| 160 | + } else { |
| 161 | + DataType::Utf8 |
| 162 | + }; |
| 163 | + let field = Arc::new(Field::new("item", item_type, false)); |
| 164 | + let list_array = GenericListArray::<O>::new( |
157 | 165 | field, |
158 | | - arrow::buffer::OffsetBuffer::new(offsets.into()), |
| 166 | + OffsetBuffer::new(offsets.into()), |
159 | 167 | values_array, |
160 | | - Some(nulls), |
| 168 | + string_array.nulls().cloned(), |
161 | 169 | ); |
162 | 170 |
|
163 | 171 | Ok(ColumnarValue::Array(Arc::new(list_array))) |
164 | 172 | } |
165 | 173 |
|
166 | | -fn split_large_string_array( |
167 | | - string_array: &GenericStringArray<i64>, |
| 174 | +/// Push the splits of `string` into `builder`. Avoids materializing an |
| 175 | +/// intermediate `Vec<String>` — appends each `&str` slice from the regex |
| 176 | +/// iterator directly (the builder copies into its own buffer). |
| 177 | +fn push_split_parts<O: OffsetSizeTrait>( |
| 178 | + string: &str, |
168 | 179 | regex: &Regex, |
169 | 180 | limit: i32, |
170 | | -) -> DataFusionResult<ColumnarValue> { |
171 | | - let mut offsets: Vec<i32> = Vec::with_capacity(string_array.len() + 1); |
172 | | - let mut values: Vec<String> = Vec::new(); |
173 | | - let mut null_buffer_builder = arrow::array::BooleanBufferBuilder::new(string_array.len()); |
174 | | - offsets.push(0); |
175 | | - |
176 | | - for i in 0..string_array.len() { |
177 | | - if string_array.is_null(i) { |
178 | | - // NULL input produces NULL in result (Spark behavior) |
179 | | - offsets.push(offsets[i]); |
180 | | - null_buffer_builder.append(false); // false = NULL |
| 181 | + builder: &mut GenericStringBuilder<O>, |
| 182 | +) { |
| 183 | + if limit == 0 { |
| 184 | + // limit = 0: split all, drop trailing empties. Need to know the end |
| 185 | + // before pushing, so collect borrowed slices first (no string copies). |
| 186 | + let mut parts: Vec<&str> = regex.split(string).collect(); |
| 187 | + while parts.last().is_some_and(|s| s.is_empty()) { |
| 188 | + parts.pop(); |
| 189 | + } |
| 190 | + if parts.is_empty() { |
| 191 | + builder.append_value(""); |
181 | 192 | } else { |
182 | | - let string_val = string_array.value(i); |
183 | | - let parts = split_with_regex(string_val, regex, limit); |
184 | | - values.extend(parts); |
185 | | - offsets.push(values.len() as i32); |
186 | | - null_buffer_builder.append(true); // true = valid |
| 193 | + for p in parts { |
| 194 | + builder.append_value(p); |
| 195 | + } |
| 196 | + } |
| 197 | + } else if limit > 0 { |
| 198 | + // limit > 0: at most limit-1 splits. |
| 199 | + let mut last_end = 0; |
| 200 | + let cap = (limit - 1) as usize; |
| 201 | + for (count, mat) in regex.find_iter(string).enumerate() { |
| 202 | + if count >= cap { |
| 203 | + break; |
| 204 | + } |
| 205 | + builder.append_value(&string[last_end..mat.start()]); |
| 206 | + last_end = mat.end(); |
| 207 | + } |
| 208 | + builder.append_value(&string[last_end..]); |
| 209 | + } else { |
| 210 | + // limit < 0: split all, keep trailing empties. |
| 211 | + for p in regex.split(string) { |
| 212 | + builder.append_value(p); |
187 | 213 | } |
188 | 214 | } |
189 | | - |
190 | | - let values_array = Arc::new(GenericStringArray::<i32>::from(values)) as ArrayRef; |
191 | | - let field = Arc::new(Field::new("item", DataType::Utf8, false)); |
192 | | - let nulls = arrow::buffer::NullBuffer::new(null_buffer_builder.finish()); |
193 | | - let list_array = ListArray::new( |
194 | | - field, |
195 | | - arrow::buffer::OffsetBuffer::new(offsets.into()), |
196 | | - values_array, |
197 | | - Some(nulls), |
198 | | - ); |
199 | | - |
200 | | - Ok(ColumnarValue::Array(Arc::new(list_array))) |
201 | 215 | } |
202 | 216 |
|
203 | 217 | fn split_string(string: &str, pattern: &str, limit: i32) -> DataFusionResult<Vec<String>> { |
|
0 commit comments