Skip to content

Commit f67b594

Browse files
authored
chore: clean up Arrow device export (#8359)
Cleanups from a review of the Arrow device export path, no behavior change: --------- Signed-off-by: Alexander Droste <alexander.droste@protonmail.com>
1 parent 89b1092 commit f67b594

5 files changed

Lines changed: 74 additions & 73 deletions

File tree

vortex-cuda/README.md

Lines changed: 13 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -10,35 +10,27 @@ Key files:
1010
- `vortex-cuda/src/arrow/canonical.rs`: canonical-array export to `ArrowDeviceArray`.
1111
- `vortex-test/e2e-cuda/src/lib.rs`: cuDF interop harness.
1212

13-
Current export coverage includes primitive, bool, decimal/temporal, string/binary view, and struct arrays. Remaining work includes null masks, broader dtype coverage, `ArrowDeviceArrayStream`, and PyVortex integration.
13+
## Building cuDF for Arrow Device interop
1414

15-
## cuDF compatibility
15+
The `cudf-test-harness` repository provides prebuilt cuDF binaries for Arrow Device interop testing on
16+
x86_64 and aarch64.
1617

17-
Vortex exports string and binary columns as Arrow `Utf8View` / `BinaryView` device arrays with producer-owned `ArrowArray.private_data`. cuDF string/binary interop requires a build containing `rapidsai/cudf#22620`; until a release version is identified, test with a cuDF commit that includes that change.
18-
19-
## Building cuDF for interop testing
20-
21-
Pass a single CUDA architecture, e.g. `-DCMAKE_CUDA_ARCHITECTURES=90a`; otherwise cuDF builds for many architectures and local builds are much slower.
18+
From the cuDF repository root, compile the Arrow Device interop target locally without exporting additional
19+
environment variables:
2220

2321
```sh
24-
export PATH=/usr/local/cuda-13.1/bin:$PATH
22+
cmake -E rm -rf cpp/build
2523

2624
cmake -S cpp -B cpp/build \
27-
-DCMAKE_INSTALL_PREFIX=${CONDA_PREFIX:-/usr/local} \
28-
-DCMAKE_CUDA_ARCHITECTURES=90a \
25+
-DCMAKE_INSTALL_PREFIX=/usr/local \
26+
-DCMAKE_CUDA_ARCHITECTURES=NATIVE \
2927
-DBUILD_TESTS=ON \
3028
-DDISABLE_DEPRECATION_WARNINGS=ON \
3129
-DCMAKE_BUILD_TYPE=Debug \
3230
-DCUDF_BUILD_STREAMS_TEST_UTIL=OFF \
33-
-DCUDAToolkit_ROOT=/usr/local/cuda-13.1 \
34-
-DCMAKE_CUDA_COMPILER=/usr/local/cuda-13.1/bin/nvcc \
35-
-DCMAKE_CXX_COMPILER=/usr/bin/g++-13 \
36-
-DCMAKE_C_COMPILER=/usr/bin/gcc-13 \
37-
-GNinja
38-
39-
cmake --build cpp/build --target INTEROP_TEST -j$(nproc)
40-
41-
LD_LIBRARY_PATH=/usr/local/cuda-13.1/compat:$LD_LIBRARY_PATH ./cpp/build/gtests/INTEROP_TEST
31+
-DCUDAToolkit_ROOT=/usr/local/cuda \
32+
-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
33+
-DCMAKE_C_COMPILER=gcc \
34+
-DCMAKE_CXX_COMPILER=g++ \
35+
-GNinja && cmake --build cpp/build --target INTEROP_TEST --parallel
4236
```
43-
44-
Adjust architecture, compiler paths, and CUDA paths for the machine under test.

vortex-cuda/kernels/src/arrow_binary.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,9 @@ __device__ void init_scan_device(const BinaryView *const __restrict views,
6363
const BinaryViewRef *const view_ref = reinterpret_cast<const BinaryViewRef *>(&view);
6464
const uint64_t buffer_index = static_cast<uint64_t>(view_ref->buffer_index);
6565
const uint64_t offset = static_cast<uint64_t>(view_ref->offset);
66+
// Both addends are u32 widened to u64, so the end position cannot wrap.
6667
const uint64_t end = offset + static_cast<uint64_t>(size);
67-
if (buffer_index >= data_buffer_count || end < offset || end > data_buffer_lens[buffer_index]) {
68+
if (buffer_index >= data_buffer_count || end > data_buffer_lens[buffer_index]) {
6869
scan[idx] = 0;
6970
atomicMax(status, 1u);
7071
continue;

vortex-cuda/src/arrow/canonical.rs

Lines changed: 27 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ fn export_canonical(
362362
})
363363
}
364364

365-
/// Export a Vortex dictionary array as an Arrow dictionary array.
365+
/// Export a Vortex dictionary array as an Arrow Device dictionary array.
366366
///
367367
/// Owns the codes buffers and recursively exported dictionary values.
368368
async fn export_dict(
@@ -538,11 +538,7 @@ where
538538
Ok(BufferHandle::new_device(Arc::new(output_device)))
539539
}
540540

541-
/// Export Vortex binary views as standard Arrow `Binary`.
542-
///
543-
/// cuDF imports Arrow `Binary` through the Arrow Device path, but does not currently accept
544-
/// Arrow `BinaryView`. This path keeps conversion on the CUDA stream by building `i32` offsets
545-
/// from view sizes and gathering inline/out-of-line view bytes into one contiguous values buffer.
541+
/// Export Vortex binary views as an Arrow Device array with standard `Binary` layout.
546542
async fn export_binary(
547543
varbinview: VarBinViewArray,
548544
ctx: &mut CudaExecutionCtx,
@@ -764,9 +760,13 @@ fn gather_binary_values(
764760
)
765761
}
766762

767-
/// Export Vortex validity as an Arrow validity byte buffer on the CUDA device.
763+
/// Export Vortex validity as an Arrow Device validity byte buffer.
768764
///
769765
/// Returns `None` for the buffer when Arrow can omit validity because all rows are valid.
766+
///
767+
/// Returned buffers use zeroed 4-byte padding so cuDF's word-sized mask reads stay in bounds.
768+
/// Bits at positions `>= len + arrow_offset` within the final data byte are unspecified, as
769+
/// Arrow permits.
770770
pub(super) async fn export_arrow_validity_buffer(
771771
validity: Validity,
772772
len: usize,
@@ -876,7 +876,7 @@ pub(super) fn repack_arrow_validity_buffer(
876876
Ok(BufferHandle::new_device(Arc::new(output_device)).slice(0..output_bytes))
877877
}
878878

879-
/// Export a standard Vortex list as Arrow `List`: validity, offsets, and one child array.
879+
/// Export a Vortex list-view as an Arrow Device array with `List` layout.
880880
async fn export_list_view(
881881
listview: ListViewArray,
882882
ctx: &mut CudaExecutionCtx,
@@ -903,29 +903,12 @@ async fn export_list_view(
903903
.await
904904
}
905905

906+
/// Export a standard Vortex list as an Arrow Device array with `List` layout.
906907
async fn export_list(
907908
array: ListArray,
908909
child_export: ListChildExport,
909910
ctx: &mut CudaExecutionCtx,
910911
) -> VortexResult<(ArrowArray, SyncEvent)> {
911-
let (elements, len, validity_buffer, null_count, offsets_buffer) =
912-
list_layout_parts(array, ctx).await?;
913-
export_list_layout(
914-
elements,
915-
len,
916-
validity_buffer,
917-
null_count,
918-
offsets_buffer,
919-
child_export,
920-
ctx,
921-
)
922-
.await
923-
}
924-
925-
async fn list_layout_parts(
926-
array: ListArray,
927-
ctx: &mut CudaExecutionCtx,
928-
) -> VortexResult<(ArrayRef, usize, Option<BufferHandle>, i64, BufferHandle)> {
929912
let len = array.len();
930913
let ListDataParts {
931914
elements,
@@ -936,7 +919,16 @@ async fn list_layout_parts(
936919

937920
let (validity_buffer, null_count) = export_arrow_validity_buffer(validity, len, 0, ctx).await?;
938921
let offsets_buffer = export_arrow_list_offsets(offsets, ctx).await?;
939-
Ok((elements, len, validity_buffer, null_count, offsets_buffer))
922+
export_list_layout(
923+
elements,
924+
len,
925+
validity_buffer,
926+
null_count,
927+
offsets_buffer,
928+
child_export,
929+
ctx,
930+
)
931+
.await
940932
}
941933

942934
#[derive(Clone, Copy)]
@@ -946,6 +938,11 @@ pub(super) enum ListChildExport {
946938
PreserveConcreteLayout,
947939
/// Canonicalize temporary encodings introduced by the host ListView
948940
/// rebuild, while still preserving rebuilt dictionary children.
941+
///
942+
/// This is not equivalent to `export_array`: the take-based rebuild wraps
943+
/// children in transient encodings (for example `take` returns `Dict`
944+
/// arrays nested inside struct fields) that the pre-computed export schema
945+
/// does not include, so they must be canonicalized away before export.
949946
RebuiltListViewChild,
950947
}
951948

@@ -968,7 +965,7 @@ impl ListChildExport {
968965
}
969966
}
970967

971-
/// Build the shared Arrow `List` parent once offsets and validity are ready on device.
968+
/// Build the shared Arrow Device `List` parent once offsets and validity are ready.
972969
pub(super) async fn export_list_layout(
973970
elements: ArrayRef,
974971
len: usize,
@@ -979,24 +976,7 @@ pub(super) async fn export_list_layout(
979976
ctx: &mut CudaExecutionCtx,
980977
) -> VortexResult<(ArrowArray, SyncEvent)> {
981978
let elements_child = child_export.export(elements, ctx).await?;
982-
export_list_layout_with_child(
983-
elements_child,
984-
len,
985-
validity_buffer,
986-
null_count,
987-
offsets_buffer,
988-
ctx,
989-
)
990-
}
991979

992-
fn export_list_layout_with_child(
993-
elements_child: ArrowArray,
994-
len: usize,
995-
validity_buffer: Option<BufferHandle>,
996-
null_count: i64,
997-
offsets_buffer: BufferHandle,
998-
ctx: &mut CudaExecutionCtx,
999-
) -> VortexResult<(ArrowArray, SyncEvent)> {
1000980
let mut private_data = PrivateData::new(
1001981
vec![validity_buffer, Some(offsets_buffer)],
1002982
vec![elements_child],
@@ -1017,7 +997,7 @@ fn export_list_layout_with_child(
1017997
Ok((arrow_list, sync_event))
1018998
}
1019999

1020-
/// Export a Vortex fixed-size-list as Arrow `List`.
1000+
/// Export a Vortex fixed-size-list as an Arrow Device array with `List` layout.
10211001
///
10221002
/// cuDF's Arrow Device import accepts `List`/`LargeList` as cuDF `LIST`, but rejects
10231003
/// `FixedSizeList`, so emit equivalent standard Arrow `List` offsets.
@@ -1069,7 +1049,7 @@ async fn fixed_size_list_offsets(
10691049
.await
10701050
}
10711051

1072-
/// Return cuDF-supported Arrow `List` offsets as an `i32` device buffer.
1052+
/// Return Arrow Device `List` offsets as an `i32` device buffer.
10731053
async fn export_arrow_list_offsets(
10741054
offsets: ArrayRef,
10751055
ctx: &mut CudaExecutionCtx,

vortex-cuda/src/arrow/list_view.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ use crate::CudaExecutionCtx;
3939
use crate::cub::exclusive_sum_i32;
4040
use crate::executor::CudaArrayExt;
4141

42-
/// Export a Vortex list-view as Arrow `List` using device kernels.
42+
/// Export a Vortex list-view as an Arrow Device array with `List` layout using device kernels.
4343
///
4444
/// Reuses contiguous children; rebuilds non-contiguous primitive or dictionary-code children.
4545
pub(super) async fn export_device_list_view(
@@ -121,7 +121,7 @@ enum DeviceListViewOffsets {
121121
RequiresRebuild,
122122
}
123123

124-
/// Build cuDF-supported `i32` Arrow `List` offsets from list-view offset/size device buffers.
124+
/// Build Arrow Device `List` offsets from list-view offset/size device buffers.
125125
#[expect(clippy::cognitive_complexity)]
126126
async fn export_device_list_view_offsets(
127127
offsets_ptype: PType,
@@ -391,7 +391,7 @@ async fn export_rebuilt_dict_list_view(
391391
.await
392392
}
393393

394-
/// Rebuild a non-contiguous primitive list-view child and export it as an Arrow List.
394+
/// Rebuild a non-contiguous primitive list-view child and export it as Arrow Device `List`.
395395
#[expect(clippy::too_many_arguments)]
396396
async fn export_rebuilt_primitive_list_view(
397397
elements: ArrayRef,

vortex-cuda/src/dynamic_dispatch/plan_builder.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,9 @@ fn is_dyn_dispatch_cast_compatible(array: &ArrayRef) -> bool {
139139
let Ok(source_ptype) = PType::try_from(cast.child_at(0).dtype()) else {
140140
return false;
141141
};
142-
let target_ptype = cast.scalar_fn().as_::<Cast>().as_ptype();
142+
let Ok(target_ptype) = PType::try_from(cast.scalar_fn().as_::<Cast>()) else {
143+
return false;
144+
};
143145

144146
// Implemented as unsigned dictionary-code casts to cuDF's signed index types.
145147
// LOAD/BITUNPACK materialize directly into the target-width output type.
@@ -892,3 +894,29 @@ impl FusedPlan {
892894
len * final_elem_bytes.max(output_elem_bytes)
893895
}
894896
}
897+
898+
#[cfg(test)]
899+
mod tests {
900+
use vortex::array::IntoArray;
901+
use vortex::array::arrays::PrimitiveArray;
902+
use vortex::array::builtins::ArrayBuiltins;
903+
use vortex::dtype::DType;
904+
use vortex::dtype::Nullability;
905+
906+
use super::*;
907+
908+
#[test]
909+
fn cast_to_non_primitive_target_is_not_dyn_dispatch_compatible() -> VortexResult<()> {
910+
let cast = PrimitiveArray::from_iter([0u8, 1])
911+
.into_array()
912+
.cast(DType::Bool(Nullability::NonNullable))?;
913+
914+
assert!(!is_dyn_dispatch_cast_compatible(&cast));
915+
assert!(matches!(
916+
DispatchPlan::new(&cast, CudaDispatchMode::DynDispatchOnly)?,
917+
DispatchPlan::Unfused
918+
));
919+
920+
Ok(())
921+
}
922+
}

0 commit comments

Comments
 (0)