Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions encodings/fastlanes/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,9 @@ impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_fastlanes::Bi

pub fn vortex_fastlanes::BitPackedVTable::filter(array: &vortex_fastlanes::BitPackedArray, mask: &vortex_mask::Mask, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>

impl vortex_array::arrays::slice::SliceKernel for vortex_fastlanes::BitPackedVTable
impl vortex_array::arrays::slice::SliceReduce for vortex_fastlanes::BitPackedVTable

pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range<usize>, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
pub fn vortex_fastlanes::BitPackedVTable::slice(array: &vortex_fastlanes::BitPackedArray, range: core::ops::range::Range<usize>) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>

impl vortex_array::compute::is_constant::IsConstantKernel for vortex_fastlanes::BitPackedVTable

Expand Down
64 changes: 30 additions & 34 deletions encodings/fastlanes/src/bitpacking/compute/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,15 @@ use std::cmp::max;
use std::ops::Range;

use vortex_array::ArrayRef;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::arrays::SliceKernel;
use vortex_array::arrays::SliceReduce;
use vortex_error::VortexResult;

use crate::BitPackedArray;
use crate::BitPackedVTable;

impl SliceKernel for BitPackedVTable {
fn slice(
array: &BitPackedArray,
range: Range<usize>,
_ctx: &mut ExecutionCtx,
) -> VortexResult<Option<ArrayRef>> {
impl SliceReduce for BitPackedVTable {
fn slice(array: &BitPackedArray, range: Range<usize>) -> VortexResult<Option<ArrayRef>> {
let offset_start = range.start + array.offset() as usize;
let offset_stop = range.end + array.offset() as usize;
let offset = offset_start % 1024;
Expand Down Expand Up @@ -51,43 +46,44 @@ impl SliceKernel for BitPackedVTable {

#[cfg(test)]
mod tests {
use std::sync::LazyLock;

use vortex_array::Array;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::SliceArray;
use vortex_array::session::ArraySession;
use vortex_array::vtable::VTable;
use vortex_array::arrays::SliceReduce;
use vortex_array::arrays::SliceVTable;
use vortex_error::VortexResult;
use vortex_session::VortexSession;

use crate::BitPackedVTable;
use crate::bitpack_compress::bitpack_encode;

static SESSION: LazyLock<VortexSession> =
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
#[test]
fn test_slice_returns_bitpacked() -> VortexResult<()> {
let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048);
let bitpacked = bitpack_encode(&values, 11, None)?;

let result =
BitPackedVTable::slice(&bitpacked, 500..1500)?.expect("expected slice to succeed");

assert!(result.is::<BitPackedVTable>());
let result_bp = result.as_::<BitPackedVTable>();
assert_eq!(result_bp.offset(), 500);
assert_eq!(result.len(), 1000);

Ok(())
}

#[test]
fn test_execute_parent_returns_bitpacked_slice() -> VortexResult<()> {
fn test_slice_via_array_trait() -> VortexResult<()> {
let values = vortex_array::arrays::PrimitiveArray::from_iter(0u32..2048);
let bitpacked = bitpack_encode(&values, 11, None)?;

let slice_array = SliceArray::new(bitpacked.clone().into_array(), 500..1500);

let mut ctx = SESSION.create_execution_ctx();
let reduced = <BitPackedVTable as VTable>::execute_parent(
&bitpacked,
&slice_array.into_array(),
0,
&mut ctx,
)?
.expect("expected slice kernel to execute");

assert!(reduced.is::<BitPackedVTable>());
let reduced_bp = reduced.as_::<BitPackedVTable>();
assert_eq!(reduced_bp.offset(), 500);
assert_eq!(reduced.len(), 1000);
let sliced = bitpacked.as_ref().slice(500..1500)?;

// After optimize, the SliceArray should have been reduced away.
assert!(
!sliced.is::<SliceVTable>(),
"expected SliceReduce to eliminate the SliceArray wrapper"
);
assert!(sliced.is::<BitPackedVTable>());
assert_eq!(sliced.len(), 1000);

Ok(())
}
Expand Down
2 changes: 0 additions & 2 deletions encodings/fastlanes/src/bitpacking/vtable/kernels.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,12 @@
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use vortex_array::arrays::FilterExecuteAdaptor;
use vortex_array::arrays::SliceExecuteAdaptor;
use vortex_array::arrays::TakeExecuteAdaptor;
use vortex_array::kernel::ParentKernelSet;

use crate::BitPackedVTable;

pub(crate) const PARENT_KERNELS: ParentKernelSet<BitPackedVTable> = ParentKernelSet::new(&[
ParentKernelSet::lift(&FilterExecuteAdaptor(BitPackedVTable)),
ParentKernelSet::lift(&SliceExecuteAdaptor(BitPackedVTable)),
ParentKernelSet::lift(&TakeExecuteAdaptor(BitPackedVTable)),
]);
19 changes: 1 addition & 18 deletions encodings/fastlanes/src/bitpacking/vtable/operations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,10 @@ impl OperationsVTable<BitPackedVTable> for BitPackedVTable {
#[cfg(test)]
mod test {
use std::ops::Range;
use std::sync::LazyLock;

use vortex_array::Array;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::SliceArray;
use vortex_array::assert_arrays_eq;
use vortex_array::assert_nth_scalar;
use vortex_array::buffer::BufferHandle;
Expand All @@ -41,9 +38,7 @@ mod test {
use vortex_array::dtype::PType;
use vortex_array::patches::Patches;
use vortex_array::scalar::Scalar;
use vortex_array::session::ArraySession;
use vortex_array::validity::Validity;
use vortex_array::vtable::VTable;
use vortex_buffer::Alignment;
use vortex_buffer::Buffer;
use vortex_buffer::ByteBuffer;
Expand All @@ -52,20 +47,8 @@ mod test {
use crate::BitPackedArray;
use crate::BitPackedVTable;

static SESSION: LazyLock<vortex_session::VortexSession> =
LazyLock::new(|| vortex_session::VortexSession::empty().with::<ArraySession>());

fn slice_via_kernel(array: &BitPackedArray, range: Range<usize>) -> BitPackedArray {
let slice_array = SliceArray::new(array.clone().into_array(), range);
let mut ctx = SESSION.create_execution_ctx();
let sliced = <BitPackedVTable as VTable>::execute_parent(
array,
&slice_array.into_array(),
0,
&mut ctx,
)
.expect("execute_parent failed")
.expect("expected slice kernel to execute");
let sliced = array.as_ref().slice(range).expect("slice failed");
sliced.as_::<BitPackedVTable>().clone()
}

Expand Down
7 changes: 5 additions & 2 deletions encodings/fastlanes/src/bitpacking/vtable/rules.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

use vortex_array::arrays::SliceReduceAdaptor;
use vortex_array::optimizer::rules::ParentRuleSet;
use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor;

use crate::BitPackedVTable;

pub(crate) const RULES: ParentRuleSet<BitPackedVTable> =
ParentRuleSet::new(&[ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable))]);
pub(crate) const RULES: ParentRuleSet<BitPackedVTable> = ParentRuleSet::new(&[
ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable)),
ParentRuleSet::lift(&SliceReduceAdaptor(BitPackedVTable)),
]);
2 changes: 1 addition & 1 deletion vortex-cuda/gpu-scan-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ async fn main() -> VortexResult<()> {

// Create a full scan that executes on the GPU
let cuda_stream =
VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).get_stream()?;
VortexCudaStreamPool::new(Arc::clone(cuda_ctx.stream().context()), 1).stream()?;
let gpu_reader = CopyDeviceReadAt::new(recompressed, cuda_stream);

let gpu_file = session
Expand Down
4 changes: 3 additions & 1 deletion vortex-cuda/kernels/src/dynamic_dispatch.cu
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,10 @@ __device__ inline void dynamic_source_op(const T *__restrict input,
constexpr uint32_t FL_CHUNK_SIZE = 1024;
constexpr uint32_t LANES_PER_FL_BLOCK = FL_CHUNK_SIZE / T_BITS;
const uint32_t bit_width = source_op.params.bitunpack.bit_width;
const uint32_t element_offset = source_op.params.bitunpack.element_offset;
const uint32_t packed_words_per_fl_block = LANES_PER_FL_BLOCK * bit_width;
const uint64_t first_fl_block = chunk_start / FL_CHUNK_SIZE;
// Shift chunk_start by the sub-block element offset.
const uint64_t first_fl_block = (chunk_start + element_offset) / FL_CHUNK_SIZE;

// FL blocks must divide evenly. Otherwise, the last unpack would overflow smem.
static_assert((ELEMENTS_PER_BLOCK % FL_CHUNK_SIZE) == 0);
Expand Down
3 changes: 3 additions & 0 deletions vortex-cuda/kernels/src/dynamic_dispatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,12 @@ union SourceParams {
/// Unpack bit-packed data using FastLanes layout.
struct BitunpackParams {
uint8_t bit_width;
uint8_t _padding[3];
uint32_t element_offset; // Element offset within FL block (0..1023)
} bitunpack;

/// Copy elements verbatim from global memory to shared memory.
/// The input pointer is pre-adjusted on the host to account for slicing.
struct LoadParams {
uint8_t _padding;
} load;
Expand Down
16 changes: 12 additions & 4 deletions vortex-cuda/src/device_buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ mod private {
}
}

// Get it back out as a View of u8

impl CudaDeviceBuffer {
/// Creates a new CUDA device buffer from a [`CudaSlice<T>`].
///
Expand All @@ -101,6 +99,16 @@ impl CudaDeviceBuffer {
}
}

/// Returns the byte offset within the allocated buffer.
pub fn offset(&self) -> usize {
self.offset
}

/// Returns the adjusted device pointer accounting for the offset.
pub fn offset_ptr(&self) -> sys::CUdeviceptr {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

device_ptr is private, there's no public API anymore to get a device pointer without an offset.

self.device_ptr + self.offset as u64
}

/// Returns a [`CudaView`] to the CUDA device buffer.
pub fn as_view<T: DeviceRepr + 'static>(&self) -> CudaView<'_, T> {
// Return a new &[T]
Expand Down Expand Up @@ -159,7 +167,7 @@ impl CudaBufferExt for BufferHandle {
.as_any()
.downcast_ref::<CudaDeviceBuffer>()
.ok_or_else(|| vortex_err!("expected CudaDeviceBuffer"))?
.device_ptr;
.offset_ptr();

Ok(ptr)
}
Expand Down Expand Up @@ -281,7 +289,7 @@ impl DeviceBuffer for CudaDeviceBuffer {

/// Slices the CUDA device buffer to a subrange.
///
/// **IMPORTANT**: this is a byte range, not elements range, due to the DeviceBuffer interface.
/// This is a byte range, not elements range, due to the DeviceBuffer interface.
fn slice(&self, range: Range<usize>) -> Arc<dyn DeviceBuffer> {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In some way I think this fn shouldn't exist on the API. Device pointers should always be passed sliced to GPU, with the offset being applied to the device ptr. Encodings that don't support element wise slicing like bitpacking use an extra offset parameter to locate them in the FL block.

assert!(
range.end <= self.len,
Expand Down
Loading
Loading