-
Notifications
You must be signed in to change notification settings - Fork 135
feat[gpu]: slice support for CUDA dyn dispatch #6705
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,13 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright the Vortex contributors | ||
|
|
||
| use vortex_array::arrays::SliceReduceAdaptor; | ||
| use vortex_array::optimizer::rules::ParentRuleSet; | ||
| use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; | ||
|
|
||
| use crate::BitPackedVTable; | ||
|
|
||
| pub(crate) const RULES: ParentRuleSet<BitPackedVTable> = | ||
| ParentRuleSet::new(&[ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable))]); | ||
| pub(crate) const RULES: ParentRuleSet<BitPackedVTable> = ParentRuleSet::new(&[ | ||
| ParentRuleSet::lift(&CastReduceAdaptor(BitPackedVTable)), | ||
| ParentRuleSet::lift(&SliceReduceAdaptor(BitPackedVTable)), | ||
| ]); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -81,8 +81,6 @@ mod private { | |
| } | ||
| } | ||
|
|
||
| // Get it back out as a View of u8 | ||
|
|
||
| impl CudaDeviceBuffer { | ||
| /// Creates a new CUDA device buffer from a [`CudaSlice<T>`]. | ||
| /// | ||
|
|
@@ -101,6 +99,16 @@ impl CudaDeviceBuffer { | |
| } | ||
| } | ||
|
|
||
| /// Returns the byte offset within the allocated buffer. | ||
| pub fn offset(&self) -> usize { | ||
| self.offset | ||
| } | ||
|
|
||
| /// Returns the adjusted device pointer accounting for the offset. | ||
| pub fn offset_ptr(&self) -> sys::CUdeviceptr { | ||
| self.device_ptr + self.offset as u64 | ||
| } | ||
|
|
||
| /// Returns a [`CudaView`] to the CUDA device buffer. | ||
| pub fn as_view<T: DeviceRepr + 'static>(&self) -> CudaView<'_, T> { | ||
| // Return a new &[T] | ||
|
|
@@ -159,7 +167,7 @@ impl CudaBufferExt for BufferHandle { | |
| .as_any() | ||
| .downcast_ref::<CudaDeviceBuffer>() | ||
| .ok_or_else(|| vortex_err!("expected CudaDeviceBuffer"))? | ||
| .device_ptr; | ||
| .offset_ptr(); | ||
|
|
||
| Ok(ptr) | ||
| } | ||
|
|
@@ -281,7 +289,7 @@ impl DeviceBuffer for CudaDeviceBuffer { | |
|
|
||
| /// Slices the CUDA device buffer to a subrange. | ||
| /// | ||
| /// **IMPORTANT**: this is a byte range, not elements range, due to the DeviceBuffer interface. | ||
| /// This is a byte range, not elements range, due to the DeviceBuffer interface. | ||
| fn slice(&self, range: Range<usize>) -> Arc<dyn DeviceBuffer> { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In some way I think this fn shouldn't exist on the API. Device pointers should always be passed sliced to GPU, with the offset being applied to the device ptr. Encodings that don't support element-wise slicing, like bitpacking, use an extra offset parameter to locate them in the FL block. |
||
| assert!( | ||
| range.end <= self.len, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`device_ptr` is private; there's no public API anymore to get a device pointer without an offset.