Formatting

lukekim · lukekim · commit beeadf94cc9b · 2025-12-22T16:17:39.000-08:00
diff --git a/candle-core/src/cpu_backend/mod.rs b/candle-core/src/cpu_backend/mod.rs
@@ -2607,7 +2607,7 @@ impl BackendStorage for CpuStorage {
             let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset())
                 .transpose(1, 2)?
                 .broadcast_as((b, k, n))?;
-            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
+            col.matmul_with_alpha(kernel, None, (b, m, n, k), &col_l, &kernel_l)?
         } else {
             // Make the kernel contiguous if not already the case.
             let mut kernel_c = unsafe {
@@ -2618,7 +2618,7 @@ impl BackendStorage for CpuStorage {
             let kernel_l = Layout::contiguous_with_offset((1, n, k), kernel_l.start_offset())
                 .transpose(1, 2)?
                 .broadcast_as((b, k, n))?;
-            col.matmul(kernel, (b, m, n, k), &col_l, &kernel_l)?
+            col.matmul_with_alpha(kernel, None, (b, m, n, k), &col_l, &kernel_l)?
         };
         let res_l = Layout::contiguous((b, l_out, params.c_out)).transpose(1, 2)?;
         let mut res_t = unsafe { self.device().alloc_uninit(res_l.shape(), res.dtype())? };
@@ -2659,8 +2659,9 @@ impl BackendStorage for CpuStorage {
                     vec![0, k_size * c_out, 1],
                     kernel_l.start_offset(),
                 );
-                self.matmul(
+                self.matmul_with_alpha(
                     kernel,
+                    None,
                     (
                         b_size,
                         /* m */ l_in,
@@ -3144,11 +3145,6 @@ impl BackendDevice for CpuDevice {
         Ok(storage)
     }
 
-    fn get_current_seed(&self) -> Result<u64> {
-        // CPU backend doesn't maintain a seed state
-        Ok(0)
-    }
-
     fn synchronize(&self) -> Result<()> {
         Ok(())
     }
diff --git a/candle-core/src/storage.rs b/candle-core/src/storage.rs
@@ -738,6 +738,7 @@ impl Storage {
         }
     }
 
+    #[allow(dead_code)]
     #[allow(clippy::too_many_arguments)]
     pub(crate) fn matmul_with_alpha_beta(
         &self,
diff --git a/candle-core/src/tensor.rs b/candle-core/src/tensor.rs
@@ -1425,6 +1425,152 @@ impl Tensor {
         Ok(from_storage(storage, c_shape, op, false))
     }
 
+    /// Matrix-multiplication with a scalar multiplier (alpha).
+    ///
+    /// Computes `alpha * (self @ rhs)` where `@` represents matrix multiplication.
+    /// If `alpha` is `None`, it defaults to 1.0.
+    ///
+    /// No backprop op is recorded for the result.
+    ///
+    /// # Arguments
+    ///
+    /// * `rhs` - The right-hand side matrix.
+    /// * `alpha` - Optional scalar multiplier applied to the result.
+    ///
+    /// # Errors
+    ///
+    /// Fails with a shape mismatch when the operands have fewer than two dimensions or
+    /// different ranks, or - for a non-empty output - different inner dimensions or batch
+    /// element counts.
+    pub fn matmul_with_alpha(&self, rhs: &Self, alpha: Option<f64>) -> Result<Self> {
+        let a_dims = self.shape().dims();
+        let b_dims = rhs.shape().dims();
+
+        let dim = a_dims.len();
+
+        if dim < 2 || b_dims.len() != dim {
+            Err(Error::ShapeMismatchBinaryOp {
+                lhs: self.shape().clone(),
+                rhs: rhs.shape().clone(),
+                op: "matmul_with_alpha",
+            }
+            .bt())?
+        }
+
+        let m = a_dims[dim - 2];
+        let k = a_dims[dim - 1];
+        let k2 = b_dims[dim - 2];
+        let n = b_dims[dim - 1];
+
+        let c_shape = Shape::from(&a_dims[..dim - 2]).extend(&[m, n]);
+        // Degenerate product (empty output or k == 0): return zeros without calling the backend.
+        if c_shape.elem_count() == 0 || k == 0 {
+            return Tensor::zeros(c_shape, self.dtype(), self.device());
+        }
+        let batching: usize = a_dims[..dim - 2].iter().product();
+        let batching_b: usize = b_dims[..dim - 2].iter().product();
+        if k != k2 || batching != batching_b {
+            Err(Error::ShapeMismatchBinaryOp {
+                lhs: self.shape().clone(),
+                rhs: rhs.shape().clone(),
+                op: "matmul_with_alpha",
+            }
+            .bt())?
+        }
+
+        let storage = self.storage().matmul_with_alpha(
+            &rhs.storage(),
+            alpha,
+            (batching, m, n, k),
+            self.layout(),
+            rhs.layout(),
+        )?;
+        // Note: No backprop for alpha-scaled matmul for now
+        let op = BackpropOp::none();
+        Ok(from_storage(storage, c_shape, op, false))
+    }
+
+    /// Matrix-multiplication with alpha and beta scaling, using a mutable output tensor.
+    ///
+    /// Computes `c = alpha * (self @ rhs) + beta * c` where `@` represents matrix multiplication.
+    /// This is an in-place operation that modifies `c`.
+    /// If `alpha` is `None`, it defaults to 1.0. Beta is implicitly 1.0.
+    ///
+    /// # Arguments
+    ///
+    /// * `rhs` - The right-hand side matrix.
+    /// * `c` - The mutable output tensor that will be modified in-place. Its last two
+    ///   dimensions must be `(m, n)` and its batch element count must match `self`.
+    /// * `alpha` - Optional scalar multiplier applied to the matmul result.
+    ///
+    /// # Errors
+    ///
+    /// Fails with a shape mismatch when `self` and `rhs` are incompatible, or when `c` cannot
+    /// hold the `(batch, m, n)` product.
+    pub fn matmul_with_alpha_beta(
+        &self,
+        rhs: &Self,
+        c: &mut Self,
+        alpha: Option<f64>,
+    ) -> Result<()> {
+        let a_dims = self.shape().dims();
+        let b_dims = rhs.shape().dims();
+        let c_dims = c.shape().dims();
+
+        let dim = a_dims.len();
+
+        if dim < 2 || b_dims.len() != dim {
+            Err(Error::ShapeMismatchBinaryOp {
+                lhs: self.shape().clone(),
+                rhs: rhs.shape().clone(),
+                op: "matmul_with_alpha_beta",
+            }
+            .bt())?
+        }
+
+        let m = a_dims[dim - 2];
+        let k = a_dims[dim - 1];
+        let k2 = b_dims[dim - 2];
+        let n = b_dims[dim - 1];
+
+        let batching: usize = a_dims[..dim - 2].iter().product();
+        let batching_b: usize = b_dims[..dim - 2].iter().product();
+        if k != k2 || batching != batching_b {
+            Err(Error::ShapeMismatchBinaryOp {
+                lhs: self.shape().clone(),
+                rhs: rhs.shape().clone(),
+                op: "matmul_with_alpha_beta",
+            }
+            .bt())?
+        }
+
+        // The backend is handed `(batching, m, n, k)` together with `c`'s layout, so `c` has
+        // to match that output shape. The error reports expected vs. actual output shape.
+        let c_matches = c_dims.len() == dim
+            && c_dims[dim - 2] == m
+            && c_dims[dim - 1] == n
+            && c_dims[..dim - 2].iter().product::<usize>() == batching;
+        if !c_matches {
+            Err(Error::ShapeMismatchBinaryOp {
+                lhs: Shape::from(&a_dims[..dim - 2]).extend(&[m, n]),
+                rhs: c.shape().clone(),
+                op: "matmul_with_alpha_beta",
+            }
+            .bt())?
+        }
+
+        self.storage().matmul_with_alpha_beta(
+            &rhs.storage(),
+            &mut c.storage_mut(),
+            alpha,
+            (batching, m, n, k),
+            self.layout(),
+            rhs.layout(),
+            c.layout(),
+        )?;
+        Ok(())
+    }
+
     /// Matrix-multiplication with broadcasting support.
     ///
     /// Compared to `matmul` the two matrixes are allowed to have different dimensions as long as
diff --git a/candle-core/src/tensor_indexing.rs b/candle-core/src/tensor_indexing.rs
@@ -5,7 +5,7 @@ use crate::{
     op::{BackpropOp, Op},
     shape::Dim,
     tensor::from_storage,
-    DType, Error, Result, Tensor,
+    DType, Error, Layout, Result, Tensor,
 };
 
 /// Specialization of `std::ops::RangeBounds` for `usize` to allow trait objects.
@@ -171,8 +171,13 @@ impl Tensor {
             }
             .bt())?
         }
-        let storage = self.storage().scatter_add(
-            self.layout(),
+        let shape = self.shape();
+        let mut storage = unsafe { self.device().alloc_uninit(shape, self.dtype())? };
+        self.storage()
+            .copy_strided_src(&mut storage, 0, self.layout())?;
+        let layout = Layout::contiguous(shape);
+        storage.scatter_add(
+            &layout,
             &indexes.storage(),
             indexes.layout(),
             &source.storage(),
diff --git a/candle-nn/benches/benchmarks/attention.rs b/candle-nn/benches/benchmarks/attention.rs
@@ -1,7 +1,8 @@
 use crate::benchmarks::{BenchDevice, BenchDeviceHandler};
 use candle::{DType, Device, Tensor};
 use candle_nn::scaled_dot_product_attention;
-use criterion::{black_box, criterion_group, Criterion, Throughput};
+use criterion::{criterion_group, Criterion, Throughput};
+use std::hint::black_box;
 use std::time::Instant;
 
 fn run_attention(q: &Tensor, k: &Tensor, v: &Tensor, m: &Tensor, s: f64) {
diff --git a/candle-nn/src/layer_norm.rs b/candle-nn/src/layer_norm.rs
@@ -351,3 +351,13 @@ pub fn rms_norm_quant(
         _ghost: PhantomData,
     })
 }
+
+/// Builds a non-quantized `RmsNorm` layer from `size`, `eps` and the variable builder `vb`.
+/// Thin convenience wrapper: forwards every argument unchanged to `rms_norm_non_quant`.
+pub fn rms_norm(
+    size: usize,
+    eps: f64,
+    vb: crate::VarBuilder,
+) -> Result<RmsNorm<RmsNormNonQuantized>> {
+    rms_norm_non_quant(size, eps, vb)
+}
diff --git a/candle-nn/src/lib.rs b/candle-nn/src/lib.rs
@@ -52,11 +52,11 @@ pub use func::{func, func_t, Func, FuncT};
 pub use group_norm::{group_norm, GroupNorm};
 pub use init::Init;
 pub use layer_norm::{
-    layer_norm, layer_norm_no_bias, rms_norm_non_quant, rms_norm_quant, LayerNorm, LayerNormConfig,
-    RmsNorm,
+    layer_norm, layer_norm_no_bias, rms_norm, rms_norm_non_quant, rms_norm_quant, LayerNorm,
+    LayerNormConfig, RmsNorm,
 };
 pub use linear::{linear, linear_b, linear_no_bias, Linear};
-pub use ops::{kvconcat, Dropout};
+pub use ops::Dropout;
 pub use optim::{AdamW, Optimizer, ParamsAdamW, SGD};
 pub use rnn::{gru, lstm, GRUConfig, LSTMConfig, GRU, LSTM, RNN};
 pub use rope::RotaryEmbedding;
diff --git a/candle-nn/src/var_builder.rs b/candle-nn/src/var_builder.rs
@@ -317,32 +317,6 @@ impl SimpleBackend for HashMap<String, Tensor> {
         tensor.to_device(dev)?.to_dtype(dtype)
     }
 
-    fn get_unchecked(&self, name: &str, dtype: DType, dev: &Device) -> Result<Tensor> {
-        let tensor = self
-            .get(name)
-            .ok_or_else(|| {
-                Error::CannotFindTensor {
-                    path: name.to_string(),
-                }
-                .bt()
-            })?
-            .clone();
-        tensor.to_device(dev)?.to_dtype(dtype)
-    }
-
-    fn get_unchecked(&self, name: &str, dtype: DType, dev: &Device) -> Result<Tensor> {
-        let tensor = self
-            .get(name)
-            .ok_or_else(|| {
-                Error::CannotFindTensor {
-                    path: name.to_string(),
-                }
-                .bt()
-            })?
-            .clone();
-        tensor.to_device(dev)?.to_dtype(dtype)
-    }
-
     fn contains_tensor(&self, name: &str) -> bool {
         self.contains_key(name)
     }
diff --git a/candle-nn/tests/ops.rs b/candle-nn/tests/ops.rs
@@ -5,7 +5,6 @@ extern crate intel_mkl_src;
 extern crate accelerate_src;
 
 use candle::{test_device, test_utils::to_vec3_round, Device, IndexOp, Result, Tensor};
-use candle_nn::Activation;
 
 fn softmax(device: &Device) -> Result<()> {
     let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
@@ -53,22 +52,6 @@ fn softmax(device: &Device) -> Result<()> {
     Ok(())
 }
 
-fn inplace_softmax(device: &Device) -> Result<()> {
-    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
-    let mut tensor = Tensor::new(data, device)?.log()?;
-    candle_nn::ops::inplace_softmax_last_dim(&mut tensor)?;
-    assert_eq!(
-        to_vec3_round(&tensor, 4)?,
-        &[
-            // (3, 1, 4) / 8, (1, 5, 9) / 15
-            [[0.375, 0.125, 0.5], [0.0667, 0.3333, 0.6]],
-            // (2, 1, 7) / 10, (8, 2, 8) / 18
-            [[0.2, 0.1, 0.7], [0.4444, 0.1111, 0.4444]]
-        ]
-    );
-    Ok(())
-}
-
 fn rms_norm(device: &Device) -> Result<()> {
     let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
     let tensor = Tensor::new(data, device)?;
@@ -341,44 +324,12 @@ fn sigmoid(device: &Device) -> Result<()> {
     Ok(())
 }
 
-fn mul_and_act(device: &Device) -> Result<()> {
-    let data = &[[[3f32, 1., 4.], [1., 5., 9.]], [[2., 1., 7.], [8., 2., 8.]]];
-    let cpu = Tensor::new(data, &Device::Cpu)?;
-    let x = Tensor::new(data, device)?;
-
-    for act in [Activation::Gelu, Activation::Relu, Activation::Silu] {
-        let truth = candle_nn::ops::mul_and_act(&cpu, &cpu, act)?;
-        let test = candle_nn::ops::mul_and_act(&x, &x, act)?.to_device(&Device::Cpu)?;
-
-        let sum_diff = (truth - test)?.abs()?.sum_all()?.to_vec0::<f32>()?;
-        if device.is_cpu() {
-            assert_eq!(sum_diff, 0., "act = {act:?}");
-        } else {
-            assert!(sum_diff < 3e-3, "act = {act:?}");
-        }
-    }
-
-    Ok(())
-}
-
 test_device!(ropei, ropei_cpu, ropei_gpu, ropei_metal);
 test_device!(rope, rope_cpu, rope_gpu, rope_metal);
 test_device!(rope_thd, rope_thd_cpu, rope_thd_gpu, rope_thd_metal);
 test_device!(softmax, softmax_cpu, softmax_gpu, softmax_metal);
-test_device!(
-    inplace_softmax,
-    inplace_softmax_cpu,
-    inplace_softmax_gpu,
-    inplace_softmax_metal
-);
 test_device!(rms_norm, rms_norm_cpu, rms_norm_gpu, rms_norm_metal);
 test_device!(rms_norml, rms_norml_cpu, rms_norml_gpu, rms_norml_metal);
 test_device!(layer_norm, ln_cpu, ln_gpu, ln_metal);
 test_device!(layer_norml, lnl_cpu, lnl_gpu, lnl_metal);
 test_device!(sigmoid, sigmoid_cpu, sigmoid_gpu, sigmoid_metal);
-test_device!(
-    mul_and_act,
-    mul_and_act_cpu,
-    mul_and_act_gpu,
-    mul_and_act_metal
-);
diff --git a/candle-pyo3/src/lib.rs b/candle-pyo3/src/lib.rs
@@ -17,7 +17,7 @@ extern crate intel_mkl_src;
 #[cfg(feature = "accelerate")]
 extern crate accelerate_src;
 
-use candle::{quantized::QTensor, DType, Device, Module, Tensor, WithDType};
+use ::candle::{quantized::QTensor, DType, Device, Module, Tensor, WithDType};
 
 mod utils;
 use utils::wrap_err;
@@ -217,12 +217,6 @@ trait MapDType {
             DType::F16 => self.f::<f16>(t),
             DType::F32 => self.f::<f32>(t),
             DType::F64 => self.f::<f64>(t),
-            DType::I16 => Err(PyErr::new::<PyTypeError, _>(
-                "i16 dtype is not supported in Python interface",
-            )),
-            DType::I32 => Err(PyErr::new::<PyTypeError, _>(
-                "i32 dtype is not supported in Python interface",
-            )),
             DType::F8E4M3 => Err(PyErr::new::<PyTypeError, _>(
                 "f8e4m3 dtype is not supported in Python interface",
             )),
@@ -1104,7 +1098,7 @@ impl PyTensor {
     /// Quantize the tensor.
     /// &RETURNS&: QTensor
     fn quantize(&self, quantized_dtype: &str) -> PyResult<PyQTensor> {
-        use candle::quantized;
+        use ::candle::quantized;
         let res = match quantized_dtype.to_lowercase().as_str() {
             "q2k" => quantized::QTensor::quantize(self, quantized::GgmlDType::Q2K),
             "q3k" => quantized::QTensor::quantize(self, quantized::GgmlDType::Q3K),

Original file line number	Diff line number	Diff line change
`@@ -738,6 +738,7 @@ impl Storage {`
`738`	`738`	`}`
`739`	`739`	`}`
`740`	`740`
	`741`	`+ #[allow(dead_code)]`
`741`	`742`	`#[allow(clippy::too_many_arguments)]`
`742`	`743`	`pub(crate) fn matmul_with_alpha_beta(`
`743`	`744`	`&self,`