feat[gpu]: slice support for CUDA dyn dispatch

0ax1 · 0ax1 · commit db5ead625c21 · 2026-02-27T13:04:43.000Z
Signed-off-by: Alexander Droste &lt;alexander.droste@protonmail.com&gt;
diff --git a/encodings/fastlanes/public-api.lock b/encodings/fastlanes/public-api.lock
@@ -196,9 +196,9 @@ impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_fastlanes::Bi
 
 pub fn vortex_fastlanes::BitPackedVTable::filter(array: &vortex_fastlanes::BitPackedArray, mask: &vortex_mask::Mask, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
 
-impl vortex_array::arrays::slice::SliceKernel for vortex_fastlanes::BitPackedVTable
+impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPackedVTable
 
-pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range<usize>, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
+pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range<usize>) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
 
 impl vortex_array::compute::is_constant::IsConstantKernel for vortex_fastlanes::BitPackedVTable
 
diff --git a/encodings/fastlanes/src/bitpacking/compute/slice.rs b/encodings/fastlanes/src/bitpacking/compute/slice.rs
@@ -5,20 +5,15 @@ use std::cmp::max;
 use std::ops::Range;
 
 use vortex_array::ArrayRef;
-use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
-use vortex_array::arrays::SliceKernel;
+use vortex_array::arrays::SliceReduce;
 use vortex_error::VortexResult;
 
 use crate::BitPackedArray;
 use crate::BitPackedVTable;
 
-impl SliceKernel for BitPackedVTable {
-    fn slice(
-        array: &BitPackedArray,
-        range: Range<usize>,
-        _ctx: &mut ExecutionCtx,
-    ) -> VortexResult<Option<ArrayRef>> {
+impl SliceReduce for BitPackedVTable {
+    fn slice(array: &BitPackedArray, range: Range<usize>) -> VortexResult<Option<ArrayRef>> {
         let offset_start = range.start + array.offset() as usize;
         let offset_stop = range.end + array.offset() as usize;
         let offset = offset_start % 1024;
@@ -51,43 +46,44 @@ impl SliceKernel for BitPackedVTable {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::LazyLock;
-
     use vortex_array::Array;
-    use vortex_array::IntoArray;
-    use vortex_array::VortexSessionExecute;
-    use vortex_array::arrays::SliceArray;
-    use vortex_array::session::ArraySession;
-    use vortex_array::vtable::VTable;
+    use vortex_array::arrays::SliceReduce;
+    use vortex_array::arrays::SliceVTable;
     use vortex_error::VortexResult;
-    use vortex_session::VortexSession;
 
     use crate::BitPackedVTable;
     use crate::bitpack_compress::bitpack_encode;
 
-    static SESSION: LazyLock<VortexSession> =
-        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+    #[test]
+    fn test_slice_returns_bitpacked() -> VortexResult<()> {
+        let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048);
+        let bitpacked = bitpack_encode(&values, 11, None)?;
+
+        let result =
+            BitPackedVTable::slice(&bitpacked, 500..1500)?.expect("expected slice to succeed");
+
+        assert!(result.is::<BitPackedVTable>());
+        let result_bp = result.as_::<BitPackedVTable>();
+        assert_eq!(result_bp.offset(), 500);
+        assert_eq!(result.len(), 1000);
+
+        Ok(())
+    }
 
     #[test]
-    fn test_execute_parent_returns_bitpacked_slice() -> VortexResult<()> {
+    fn test_slice_via_array_trait() -> VortexResult<()> {
         let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048);
         let bitpacked = bitpack_encode(&values, 11, None)?;
 
-        let slice_array = SliceArray::new(bitpacked.clone().into_array(), 500..1500);
-
-        let mut ctx = SESSION.create_execution_ctx();
-        let reduced = <BitPackedVTable as VTable>::execute_parent(
-            &bitpacked,
-            &slice_array.into_array(),
-            0,
-            &mut ctx,
-        )?
-        .expect("expected slice kernel to execute");
-
-        assert!(reduced.is::<BitPackedVTable>());
-        let reduced_bp = reduced.as_::<BitPackedVTable>();
-        assert_eq!(reduced_bp.offset(), 500);
-        assert_eq!(reduced.len(), 1000);
+        let sliced = bitpacked.as_ref().slice(500..1500)?;
+
+        // After optimize, the SliceArray should have been reduced away.
+        assert!(
+            !sliced.is::<SliceVTable>(),
+            "expected SliceReduce to eliminate the SliceArray wrapper"
+        );
+        assert!(sliced.is::<BitPackedVTable>());
+        assert_eq!(sliced.len(), 1000);
 
         Ok(())
     }
diff --git a/encodings/fastlanes/src/bitpacking/vtable/kernels.rs b/encodings/fastlanes/src/bitpacking/vtable/kernels.rs
@@ -2,14 +2,12 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use vortex_array::arrays::FilterExecuteAdaptor;
-use vortex_array::arrays::SliceExecuteAdaptor;
 use vortex_array::arrays::TakeExecuteAdaptor;
 use vortex_array::kernel::ParentKernelSet;
 
 use crate::BitPackedVTable;
 
 pub(crate) const PARENT_KERNELS: ParentKernelSet<BitPackedVTable> = ParentKernelSet::new(&[
     ParentKernelSet::lift(&FilterExecuteAdaptor(BitPackedVTable)),
-    ParentKernelSet::lift(&SliceExecuteAdaptor(BitPackedVTable)),
     ParentKernelSet::lift(&TakeExecuteAdaptor(BitPackedVTable)),
 ]);
diff --git a/encodings/fastlanes/src/bitpacking/vtable/operations.rs b/encodings/fastlanes/src/bitpacking/vtable/operations.rs
@@ -26,13 +26,10 @@ impl OperationsVTable<BitPackedVTable> for BitPackedVTable {
 #[cfg(test)]
 mod test {
     use std::ops::Range;
-    use std::sync::LazyLock;
 
     use vortex_array::Array;
     use vortex_array::IntoArray;
-    use vortex_array::VortexSessionExecute;
     use vortex_array::arrays::PrimitiveArray;
-    use vortex_array::arrays::SliceArray;
     use vortex_array::assert_arrays_eq;
     use vortex_array::assert_nth_scalar;
     use vortex_array::buffer::BufferHandle;
@@ -41,9 +38,7 @@ mod test {
     use vortex_array::dtype::PType;
     use vortex_array::patches::Patches;
     use vortex_array::scalar::Scalar;
-    use vortex_array::session::ArraySession;
     use vortex_array::validity::Validity;
-    use vortex_array::vtable::VTable;
     use vortex_buffer::Alignment;
     use vortex_buffer::Buffer;
     use vortex_buffer::ByteBuffer;
@@ -52,20 +47,8 @@ mod test {
     use crate::BitPackedArray;
     use crate::BitPackedVTable;
 
-    static SESSION: LazyLock<vortex_session::VortexSession> =
-        LazyLock::new(|| vortex_session::VortexSession::empty().with::<ArraySession>());
-
     fn slice_via_kernel(array: &BitPackedArray, range: Range<usize>) -> BitPackedArray {
-        let slice_array = SliceArray::new(array.clone().into_array(), range);
-        let mut ctx = SESSION.create_execution_ctx();
-        let sliced = <BitPackedVTable as VTable>::execute_parent(
-            array,
-            &slice_array.into_array(),
-            0,
-            &mut ctx,
-        )
-        .expect("execute_parent failed")
-        .expect("expected slice kernel to execute");
+        let sliced = array.as_ref().slice(range).expect("slice failed");
         sliced.as_::<BitPackedVTable>().clone()
     }
 
diff --git a/encodings/fastlanes/src/bitpacking/vtable/rules.rs b/encodings/fastlanes/src/bitpacking/vtable/rules.rs
@@ -1,10 +1,13 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use vortex_array::arrays::SliceReduceAdaptor;
 use vortex_array::optimizer::rules::ParentRuleSet;
 use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor;
 
 use crate::BitPackedVTable;
 
-pub(crate) const RULES: ParentRuleSet<BitPackedVTable> =
-    ParentRuleSet::new(&[ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable))]);
+pub(crate) const RULES: ParentRuleSet<BitPackedVTable> = ParentRuleSet::new(&[
+    ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable)),
+    ParentRuleSet::lift(&SliceReduceAdaptor(BitPackedVTable)),
+]);
diff --git a/vortex-cuda/gpu-scan-cli/src/main.rs b/vortex-cuda/gpu-scan-cli/src/main.rs
@@ -94,7 +94,7 @@ async fn main() -> VortexResult<()> {
 
     // Create a full scan that executes on the GPU
     let cuda_stream =
-        VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?;
+        VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).stream()?;
     let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream);
 
     let gpu_file = session
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.cu b/vortex-cuda/kernels/src/dynamic_dispatch.cu
@@ -56,8 +56,10 @@ __device__ inline void dynamic_source_op(const T *__restrict input,
         constexpr uint32_t FL_CHUNK_SIZE = 1024;
         constexpr uint32_t LANES_PER_FL_BLOCK = FL_CHUNK_SIZE / T_BITS;
         const uint32_t bit_width = source_op.params.bitunpack.bit_width;
+        const uint32_t element_offset = source_op.params.bitunpack.element_offset;
         const uint32_t packed_words_per_fl_block = LANES_PER_FL_BLOCK * bit_width;
-        const uint64_t first_fl_block = chunk_start / FL_CHUNK_SIZE;
+        // Shift chunk_start by the sub-block element offset.
+        const uint64_t first_fl_block = (chunk_start + element_offset) / FL_CHUNK_SIZE;
 
         // FL blocks must divide evenly. Otherwise, the last unpack would overflow smem.
         static_assert((ELEMENTS_PER_BLOCK % FL_CHUNK_SIZE) == 0);
diff --git a/vortex-cuda/kernels/src/dynamic_dispatch.h b/vortex-cuda/kernels/src/dynamic_dispatch.h
@@ -44,9 +44,12 @@ union SourceParams {
     /// Unpack bit-packed data using FastLanes layout.
     struct BitunpackParams {
         uint8_t bit_width;
+        uint8_t _padding[3];
+        uint32_t element_offset; // Element offset within FL block (0..1023)
     } bitunpack;
 
     /// Copy elements verbatim from global memory to shared memory.
+    /// The input pointer is pre-adjusted on the host to account for slicing.
     struct LoadParams {
         uint8_t _padding;
     } load;
diff --git a/vortex-cuda/src/device_buffer.rs b/vortex-cuda/src/device_buffer.rs
@@ -81,8 +81,6 @@ mod private {
     }
 }
 
-// Get it back out as a View of u8
-
 impl CudaDeviceBuffer {
     /// Creates a new CUDA device buffer from a [`CudaSlice<T>`].
     ///
@@ -101,6 +99,16 @@ impl CudaDeviceBuffer {
         }
     }
 
+    /// Returns the byte offset within the allocated buffer.
+    pub fn offset(&self) -> usize {
+        self.offset
+    }
+
+    /// Returns the adjusted device pointer accounting for the offset.
+    pub fn offset_ptr(&self) -> sys::CUdeviceptr {
+        self.device_ptr + self.offset as u64
+    }
+
     /// Returns a [`CudaView`] to the CUDA device buffer.
     pub fn as_view<T: DeviceRepr + 'static>(&self) -> CudaView<'_, T> {
         // Return a new &[T]
@@ -159,7 +167,7 @@ impl CudaBufferExt for BufferHandle {
             .as_any()
             .downcast_ref::<CudaDeviceBuffer>()
             .ok_or_else(|| vortex_err!("expected CudaDeviceBuffer"))?
-            .device_ptr;
+            .offset_ptr();
 
         Ok(ptr)
     }
@@ -279,41 +287,6 @@ impl DeviceBuffer for CudaDeviceBuffer {
         }))
     }
 
-    /// Slices the CUDA device buffer to a subrange.
-    ///
-    /// **IMPORTANT**: this is a byte range, not elements range, due to the DeviceBuffer interface.
-    fn slice(&self, range: Range<usize>) -> Arc<dyn DeviceBuffer> {
-        assert!(
-            range.end <= self.len,
-            "Slice range end {} exceeds allocation size {}",
-            range.end,
-            self.len
-        );
-
-        let new_offset = self.offset + range.start;
-        let new_len = range.end - range.start;
-
-        let trailing = (self.device_ptr + new_offset as u64).trailing_zeros();
-        let exponent =
-            u8::try_from(min(15, trailing)).vortex_expect("min(15, x) always fits in u8");
-        let slice_align = Alignment::from_exponent(exponent);
-
-        assert!(
-            slice_align.is_aligned_to(self.allocation.alignment()),
-            "slice must respect minimum alignment {}, min {}",
-            slice_align,
-            self.allocation.alignment()
-        );
-
-        Arc::new(CudaDeviceBuffer {
-            allocation: Arc::clone(&self.allocation),
-            offset: new_offset,
-            len: new_len,
-            device_ptr: self.device_ptr,
-            alignment: self.alignment,
-        })
-    }
-
     fn as_any(&self) -> &dyn std::any::Any {
         self
     }
diff --git a/vortex-cuda/src/dynamic_dispatch/mod.rs b/vortex-cuda/src/dynamic_dispatch/mod.rs
diff --git a/vortex-cuda/src/dynamic_dispatch/plan_builder.rs b/vortex-cuda/src/dynamic_dispatch/plan_builder.rs
diff --git a/vortex-cuda/src/kernel/encodings/bitpacked.rs b/vortex-cuda/src/kernel/encodings/bitpacked.rs