-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Add more misc. changes from candle fork #3196
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
178987a
d4dab0c
0ee2bc8
bc9030c
fd2b563
00689f5
60e297a
dc80e40
15591ff
5d1dbd6
a372a14
1bb1c93
cb4a042
bdb66f2
2536e75
d21b0a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -65,6 +65,12 @@ pub const RESOURCE_OPTIONS: MTLResourceOptions = | |
| //| MTLResourceOptions::HazardTrackingModeUntracked.bits(), | ||
| //); | ||
|
|
||
| // Resource options used for `new_private_buffer`. This uses `private` where supported. | ||
| #[cfg(target_os = "ios")] | ||
| pub const PRIVATE_RESOURCE_OPTIONS: MTLResourceOptions = MTLResourceOptions::StorageModeShared; | ||
| #[cfg(not(target_os = "ios"))] | ||
| pub const PRIVATE_RESOURCE_OPTIONS: MTLResourceOptions = MTLResourceOptions::StorageModePrivate; | ||
|
|
||
| impl std::fmt::Debug for MetalDevice { | ||
| fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||
| write!(f, "MetalDevice({:?})", self.id) | ||
|
|
@@ -167,6 +173,23 @@ impl MetalDevice { | |
| self.allocate_buffer(size) | ||
| } | ||
|
|
||
| /// Creates a new private buffer (not necessarily zeroed). | ||
| /// | ||
| /// This is intentionally not in the Metal buffer pool to allow the efficient implementation of persistent buffers. | ||
| pub fn new_private_buffer( | ||
|
Comment on lines
+176
to
+179
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I agree that this is nice to have, but I think we should name it something other than private buffer since that already means something for metal buffers (only available on gpu, ref). How about
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Actually, this was a mistake on my part. The correct behavior that I intended for this function is to have:
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I see. Could I ask why you want it to be private? |
||
| &self, | ||
| element_count: usize, | ||
| dtype: DType, | ||
| _name: &str, | ||
| ) -> Result<Arc<Buffer>> { | ||
| let size = element_count * dtype.size_in_bytes(); | ||
| let buffer = self | ||
| .device | ||
| .new_buffer(size, PRIVATE_RESOURCE_OPTIONS) | ||
| .map_err(MetalError::from)?; | ||
| Ok(Arc::new(buffer)) | ||
| } | ||
|
|
||
| /// Creates a new buffer from data. | ||
| /// | ||
| /// Does not require synchronization, as [newBufferWithBytes](https://developer.apple.com/documentation/metal/mtldevice/1433429-newbufferwithbytes) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -406,7 +406,125 @@ fn mul_mat_via_q8_1( | |
| Ok(CudaStorage::wrap_cuda_slice(dst, dev.clone())) | ||
| } | ||
|
|
||
| fn indexed_moe_forward_fused_q8_1_input( | ||
| weight: &CudaView<u8>, | ||
| w_shape: &crate::Shape, //[num_experts, n, k] | ||
| w_dtype: GgmlDType, | ||
| input: &CudaSlice<f32>, | ||
| in_shape: &crate::Shape, //[batch, topk or 1, k] | ||
| ids: &CudaView<u32>, | ||
| idx_shape: &crate::Shape, //[batch, topk] | ||
| dev: &CudaDevice, | ||
| ) -> Result<(CudaStorage, crate::Shape)> { | ||
| let (_, n, k) = w_shape.dims3()?; | ||
| let batch = in_shape.dims()[0]; | ||
| let input_dim1 = in_shape.dims()[1]; | ||
|
|
||
| let topk = idx_shape.dims()[1]; | ||
| assert!(batch == idx_shape.dims()[0], "batch dim not match!"); | ||
|
|
||
| // Quantize input into q8_1. | ||
| let total_rows = batch * input_dim1; | ||
| let k_padded = pad(k, MATRIX_ROW_PADDING); | ||
| // Get Q8_1 metadata. | ||
| let q8_1_block_size = GgmlDType::Q8_1.block_size(); | ||
| let q8_1_type_size = GgmlDType::Q8_1.type_size(); | ||
|
|
||
| // Calculate the size of the output buffer in bytes. | ||
| let num_blocks_per_row = k_padded / q8_1_block_size; | ||
| let dst_row_size_bytes = num_blocks_per_row * q8_1_type_size; | ||
| let y_size_in_bytes = total_rows * dst_row_size_bytes; | ||
| let mut input_quant = unsafe { dev.alloc::<u8>(y_size_in_bytes)? }; | ||
|
|
||
| let input_view = input.slice(0..); | ||
| quantize_q8_1(&input_view, &mut input_quant, k, total_rows, dev)?; | ||
|
|
||
| // output buffer | ||
| let outsize = batch * topk * n; | ||
| let out = unsafe { dev.alloc::<f32>(outsize)? }; | ||
|
|
||
| let kernel_name = match w_dtype { | ||
| GgmlDType::Q2K => "indexed_moe_forward_q2k_q8_1", | ||
| GgmlDType::Q3K => "indexed_moe_forward_q3k_q8_1", | ||
| GgmlDType::Q4K => "indexed_moe_forward_q4k_q8_1", | ||
| GgmlDType::Q5K => "indexed_moe_forward_q5k_q8_1", | ||
| GgmlDType::Q6K => "indexed_moe_forward_q6k_q8_1", | ||
| GgmlDType::Q8_0 => "indexed_moe_forward_q8_0_q8_1", | ||
| _ => crate::bail!("unsupported dtype for indexed_moe_forward {w_dtype:?}"), | ||
| }; | ||
| let func = dev.get_or_load_func(kernel_name, &candle_kernels::QUANTIZED)?; | ||
| let (nblocks, nwarps) = (n as u32, 4); | ||
| let cfg = cudarc::driver::LaunchConfig { | ||
| grid_dim: (nblocks, batch as u32, topk as u32), | ||
| block_dim: (WARP_SIZE as u32, nwarps, 1), | ||
| shared_mem_bytes: 0, | ||
| }; | ||
|
|
||
| let mut builder = func.builder(); | ||
| builder.arg(weight); | ||
| builder.arg(&input_quant); | ||
| builder.arg(ids); | ||
| builder.arg(&out); | ||
|
|
||
| barg!( | ||
| builder, | ||
| n as i32, | ||
| k as i32, | ||
| batch as i32, | ||
| topk as i32, | ||
| k_padded as i32, | ||
| input_dim1 as i32 | ||
| ); | ||
| unsafe { builder.launch(cfg) }.w()?; | ||
|
|
||
| let mut out_shape = in_shape.dims().to_vec(); | ||
| out_shape.pop(); | ||
| out_shape.push(n); | ||
| out_shape[1] = topk; | ||
| Ok(( | ||
| CudaStorage::wrap_cuda_slice(out, dev.clone()), | ||
| out_shape.into(), | ||
| )) | ||
| } | ||
|
|
||
| impl QCudaStorage { | ||
| pub fn indexed_moe_forward( | ||
| &self, | ||
| self_shape: &crate::Shape, //[num_experts, n, k] | ||
| input: &CudaStorage, //[batch, topk or 1, k] | ||
| input_l: &crate::Layout, | ||
| ids: &CudaStorage, //[batch, topk] | ||
| ids_l: &crate::Layout, | ||
| ) -> Result<(CudaStorage, crate::Shape)> { | ||
| if matches!( | ||
| self.dtype(), | ||
| GgmlDType::Q8_0 | ||
| | GgmlDType::Q2K | ||
| | GgmlDType::Q3K | ||
| | GgmlDType::Q4K | ||
| | GgmlDType::Q5K | ||
| | GgmlDType::Q6K | ||
| ) { | ||
| let input_storage = input.as_cuda_slice::<f32>()?; | ||
| let ids_storage = ids.as_cuda_slice::<u32>()?; | ||
| indexed_moe_forward_fused_q8_1_input( | ||
| &self.data.inner.slice(0..), | ||
| self_shape, //[num_experts, n, k] | ||
| self.dtype(), | ||
| &input_storage, | ||
| input_l.shape(), //[batch, topk or 1, k] | ||
| &ids_storage.slice(0..), | ||
| ids_l.shape(), //[batch, topk] | ||
| &self.device, | ||
| ) | ||
| } else { | ||
| crate::bail!( | ||
| "The given quantized dtype {:?} is not supported for indexed_moe_forward!", | ||
| self.dtype() | ||
| ); | ||
|
Comment on lines
+521
to
+524
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just thinking out loud here. It would be nice to have automatic fallback to an approach that isn't as optimized, but still valid. Perhaps returning Not thinking we add this in this PR ofc.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This might work, the issue is that effectively Regardless, providing a grouped gemm functionality will be very useful! |
||
| } | ||
| } | ||
|
|
||
| pub fn zeros(device: &CudaDevice, el_count: usize, dtype: GgmlDType) -> Result<Self> { | ||
| let size_in_bytes = ceil_div(el_count, dtype.block_size()) * dtype.type_size(); | ||
| let padded_size_in_bytes = | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.