spiceai
diff --git a/‎.gitmodules‎
Lines changed: 6 additions & 0 deletions b/‎.gitmodules‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎.vscode/settings.json‎
Lines changed: 0 additions & 11 deletions b/‎.vscode/settings.json‎
Lines changed: 0 additions & 11 deletions
diff --git a/‎Cargo.toml‎
Lines changed: 9 additions & 7 deletions b/‎Cargo.toml‎
Lines changed: 9 additions & 7 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎candle-core/Cargo.toml‎
Lines changed: 4 additions & 7 deletions b/‎candle-core/Cargo.toml‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎candle-core/benches/benchmarks/mod.rs‎
Lines changed: 5 additions & 4 deletions b/‎candle-core/benches/benchmarks/mod.rs‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎candle-core/benches/benchmarks/where_cond.rs‎
Lines changed: 1 addition & 1 deletion b/‎candle-core/benches/benchmarks/where_cond.rs‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎candle-core/src/backend.rs‎
Lines changed: 16 additions & 1 deletion b/‎candle-core/src/backend.rs‎
Lines changed: 16 additions & 1 deletion
diff --git a/‎candle-core/src/convert.rs‎
Lines changed: 16 additions & 0 deletions b/‎candle-core/src/convert.rs‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎candle-core/src/cpu/avx.rs‎
Lines changed: 81 additions & 2 deletions b/‎candle-core/src/cpu/avx.rs‎
Lines changed: 81 additions & 2 deletions
@@ -1,3 +1,9 @@
 [submodule "candle-examples/examples/flash-attn/cutlass"]
 	path = candle-flash-attn/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
+[submodule "candle-flash-attn-v3/cutlass"]
+	url = https://github.com/NVIDIA/cutlass.git
+	path = candle-flash-attn-v3/cutlass
+[submodule "candle-flash-mla/cutlass"]
+	path = candle-flash-mla/cutlass
+	url = https://github.com/NVIDIA/cutlass
@@ -13,9 +13,11 @@ members = [
 exclude = [
    "candle-book",
    "candle-flash-attn",
+   "candle-flash-attn-v3",
    "candle-kernels",
    "candle-metal-kernels",
    "candle-onnx",
+   "candle-flash-mla",
 ]
 resolver = "2"
 
@@ -36,18 +38,21 @@ byteorder = "1.4.3"
 candle = { path = "./candle-core", package = "candle-core", version = "0.9.1" }
 candle-datasets = { path = "./candle-datasets", version = "0.9.1" }
 candle-flash-attn = { path = "./candle-flash-attn", version = "0.9.1" }
+candle-flash-attn-v3 = { path = "./candle-flash-attn-v3", version = "0.9.1" }
+candle-flash-mla = { path = "./candle-flash-mla", version = "0.9.1" }
 candle-kernels = { path = "./candle-kernels", version = "0.9.1" }
 candle-metal-kernels = { path = "./candle-metal-kernels", version = "0.9.1" }
 candle-nn = { path = "./candle-nn", version = "0.9.1" }
 candle-onnx = { path = "./candle-onnx", version = "0.9.1" }
 candle-transformers = { path = "./candle-transformers", version = "0.9.1" }
 clap = { version = "4.2.4", features = ["derive"] }
 criterion = { version = "0.5.1", default-features=false }
-cudarc = { version = "0.16.1", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
+cudarc = { version = "0.13.3", features = ["std", "cublas", "cublaslt", "curand", "driver", "nvrtc", "f16", "cuda-version-from-build-system", "dynamic-linking"], default-features=false }
 fancy-regex = "0.13.0"
 gemm = { version = "0.17.0", features = ["wasm-simd128-enable"] }
-hf-hub = "0.4.1"
-half = { version = "2.5.0", features = ["num-traits", "use-intrinsics", "rand_distr"] }
+hf-hub = { version = "0.3.3", package = "candle-hf-hub" }
+half = { version = "2.3.1", features = ["num-traits", "use-intrinsics", "rand_distr"] }
+float8 = { version = "0.2.0", features = ["num-traits", "rand_distr"] }
 hound = "3.5.1"
 image = { version = "0.25.2", default-features = false, features = ["jpeg", "png"] }
 imageproc = { version = "0.24.0", default-features = false }
@@ -59,7 +64,7 @@ num_cpus = "1.15.0"
 num-traits = "0.2.15"
 parquet = { version = "51.0.0" }
 rand = "0.9.0"
-rand_distr = "0.5.1"
+rand_distr = "0.5"
 rayon = "1.7.0"
 safetensors = "0.4.1"
 serde = { version = "1.0.171", features = ["derive"] }
@@ -70,9 +75,6 @@ tokenizers = { version = "0.21.0", default-features = false }
 tracing = "0.1.37"
 tracing-chrome = "0.7.1"
 tracing-subscriber = "0.3.7"
-ug = "0.4.0"
-ug-cuda = "0.4.0"
-ug-metal = "0.4.0"
 yoke = { version = "0.7.2", features = ["derive"] }
 zip = { version = "1.1.1", default-features = false }
 metal = { version = "0.27.0", features = ["mps"]}
 
@@ -5,6 +5,8 @@
 [![License](https://img.shields.io/github/license/base-org/node?color=blue)](https://github.com/huggingface/candle/blob/main/LICENSE-MIT)
 [![License](https://img.shields.io/badge/license-Apache%202.0-blue?style=flat-square)](https://github.com/huggingface/candle/blob/main/LICENSE-APACHE)
 
+**This is an optimized implementation by Eric Buehler.**
+
 Candle is a minimalist ML framework for Rust with a focus on performance (including GPU support) 
 and ease of use. Try our online demos: 
 [whisper](https://huggingface.co/spaces/lmz/candle-whisper),
 
@@ -18,6 +18,7 @@ metal = { workspace = true, optional = true }
 cudarc = { workspace = true, optional = true }
 gemm = { workspace = true }
 half = { workspace = true }
+float8 = { workspace = true }
 intel-mkl-src = { workspace = true, optional = true }
 libc = { workspace = true, optional = true }
 memmap2 = { workspace = true }
@@ -28,26 +29,22 @@ rand_distr = { workspace = true }
 rayon = { workspace = true }
 safetensors = { workspace = true }
 thiserror = { workspace = true }
-ug-cuda = { workspace = true, optional = true }
-ug-metal = { workspace = true, optional = true }
 yoke = { workspace = true }
 zip = { workspace = true }
 
-[target.'cfg(not(target_arch = "wasm32"))'.dependencies]
-ug = { workspace = true }
-
 [dev-dependencies]
 anyhow = { workspace = true }
 clap = { workspace = true }
 criterion = { workspace = true }
 
 [features]
 default = []
-cuda = ["cudarc", "dep:candle-kernels", "dep:ug-cuda"]
+cuda = ["cudarc", "dep:candle-kernels", "float8/cuda"]
 cudnn = ["cuda", "cudarc/cudnn"]
+nccl = ["cuda", "cudarc/nccl"]
 mkl = ["dep:libc", "dep:intel-mkl-src"]
 accelerate = ["dep:libc", "dep:accelerate-src"]
-metal = ["dep:metal", "dep:candle-metal-kernels", "dep:ug-metal"]
+metal = ["dep:metal", "dep:candle-metal-kernels"]
 
 [[bench]]
 name = "bench_main"
 
@@ -22,15 +22,16 @@ impl BenchDevice for Device {
             Device::Cpu => Ok(()),
             Device::Cuda(device) => {
                 #[cfg(feature = "cuda")]
-                return Ok(device
-                    .synchronize()
-                    .map_err(|e| candle_core::Error::Cuda(Box::new(e)))?);
+                {
+                    use candle_core::cuda::WrapErr;
+                    return Ok(device.synchronize().w()?);
+                }
                 #[cfg(not(feature = "cuda"))]
                 panic!("Cuda device without cuda feature enabled: {:?}", device)
             }
             Device::Metal(device) => {
                 #[cfg(feature = "metal")]
-                return Ok(device.wait_until_completed()?);
+                return device.wait_until_completed();
                 #[cfg(not(feature = "metal"))]
                 panic!("Metal device without metal feature enabled: {:?}", device)
             }
 
@@ -22,7 +22,7 @@ const M: usize = 1024;
 const K: usize = 1024;
 const SIZE: usize = B * M * K;
 
-const DATA: [u8; SIZE] = create_cond_arr::<SIZE>();
+static DATA: [u8; SIZE] = create_cond_arr::<SIZE>();
 
 fn run_where_cond_benchmark(c: &mut Criterion, device: &Device, dtype: DType, name: &str) {
     let tensor = Tensor::from_slice(DATA.as_slice(), (B, M, K), device).unwrap();
 
@@ -103,9 +103,23 @@ pub trait BackendStorage: Sized {
         _: usize,
     ) -> Result<Self>;
 
-    fn matmul(
+    #[allow(clippy::too_many_arguments)]
+    fn matmul_with_alpha_beta(
+        &self,
+        _: &Self,
+        _: &mut Self,
+        _: Option<f64>,
+        _: (usize, usize, usize, usize),
+        _: &Layout,
+        _: &Layout,
+        _: &Layout,
+    ) -> Result<()>;
+
+    #[allow(clippy::too_many_arguments)]
+    fn matmul_with_alpha(
         &self,
         _: &Self,
+        _: Option<f64>,
         _: (usize, usize, usize, usize),
         _: &Layout,
         _: &Layout,
@@ -158,6 +172,7 @@ pub trait BackendDevice: Sized + std::fmt::Debug + Clone {
     fn rand_normal(&self, _: &Shape, _: DType, _: f64, _: f64) -> Result<Self::Storage>;
 
     fn set_seed(&self, _: u64) -> Result<()>;
+    fn get_current_seed(&self) -> Result<u64>;
 
     /// Synchronize should block until all the operations on the device are completed.
     fn synchronize(&self) -> Result<()>;
 
@@ -1,5 +1,6 @@
 //! Implement conversion traits for tensors
 use crate::{DType, Device, Error, Tensor, WithDType};
+use float8::F8E4M3;
 use half::{bf16, f16, slice::HalfFloatSliceExt};
 use std::convert::TryFrom;
 
@@ -130,6 +131,16 @@ impl Tensor {
                     f.write_u32::<LittleEndian>(v)?
                 }
             }
+            DType::I16 => {
+                for v in vs.to_vec1::<i16>()? {
+                    f.write_i16::<LittleEndian>(v)?
+                }
+            }
+            DType::I32 => {
+                for v in vs.to_vec1::<i32>()? {
+                    f.write_i32::<LittleEndian>(v)?
+                }
+            }
             DType::I64 => {
                 for v in vs.to_vec1::<i64>()? {
                     f.write_i64::<LittleEndian>(v)?
@@ -139,6 +150,11 @@ impl Tensor {
                 let vs = vs.to_vec1::<u8>()?;
                 f.write_all(&vs)?;
             }
+            DType::F8E4M3 => {
+                for v in vs.to_vec1::<F8E4M3>()? {
+                    f.write_u8(v.to_bits())?
+                }
+            }
         }
         Ok(())
     }
 
@@ -1,10 +1,10 @@
-use super::{Cpu, CpuF16};
+use super::{Cpu, CpuBF16, CpuF16};
 #[cfg(target_arch = "x86")]
 use core::arch::x86::*;
 #[cfg(target_arch = "x86_64")]
 use core::arch::x86_64::*;
 
-use half::f16;
+use half::{bf16, f16};
 
 pub struct CurrentCpu {}
 
@@ -146,3 +146,82 @@ impl CpuF16<ARR> for CurrentCpuF16 {
         *y = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
     }
 }
+
+/// AVX backend for `bf16` data: values are widened to `f32` lanes on load,
+/// all arithmetic happens in `__m256` registers, and results are narrowed
+/// back to `bf16` on store.
+pub struct CurrentCpuBF16 {}
+impl CpuBF16<ARR> for CurrentCpuBF16 {
+    type Unit = __m256;
+    type Array = [__m256; ARR];
+
+    const STEP: usize = STEP;
+    const EPR: usize = EPR;
+
+    /// Number of `Unit` accumulators held in `Array`.
+    fn n() -> usize {
+        ARR
+    }
+
+    /// Returns a register with every lane set to `0.0`.
+    unsafe fn zero() -> Self::Unit {
+        _mm256_setzero_ps()
+    }
+
+    /// Returns an accumulator array with every register zeroed.
+    unsafe fn zero_array() -> Self::Array {
+        [Self::zero(); ARR]
+    }
+
+    /// Broadcasts `v` to all eight lanes.
+    unsafe fn from_f32(v: f32) -> Self::Unit {
+        _mm256_set1_ps(v)
+    }
+
+    /// Loads eight consecutive `bf16` values and widens them to `f32` lanes.
+    ///
+    /// A bf16 value is the upper 16 bits of an f32, so widening is done by
+    /// zero-extending each 16-bit lane to 32 bits and shifting it into the
+    /// high half. The F16C conversion intrinsics must NOT be used here: they
+    /// interpret the bits as IEEE binary16 (`f16`), which has a different
+    /// exponent/mantissa split and would produce wrong values.
+    ///
+    /// # Safety
+    /// `mem_addr` must be valid for reading eight `bf16` values.
+    #[cfg(target_feature = "avx2")]
+    unsafe fn load(mem_addr: *const bf16) -> Self::Unit {
+        let raw = _mm_loadu_si128(mem_addr as *const __m128i);
+        _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(raw), 16))
+    }
+
+    /// Loads eight consecutive `bf16` values and widens them to `f32` lanes.
+    ///
+    /// Scalar fallback used when AVX2 integer instructions are unavailable.
+    ///
+    /// # Safety
+    /// `mem_addr` must be valid for reading eight `bf16` values.
+    #[cfg(not(target_feature = "avx2"))]
+    unsafe fn load(mem_addr: *const bf16) -> Self::Unit {
+        let mut tmp = [0.0f32; 8];
+        for (i, slot) in tmp.iter_mut().enumerate() {
+            *slot = (*mem_addr.add(i)).to_f32();
+        }
+        _mm256_loadu_ps(tmp.as_ptr())
+    }
+
+    /// Lane-wise `a + b`.
+    unsafe fn vec_add(a: Self::Unit, b: Self::Unit) -> Self::Unit {
+        _mm256_add_ps(a, b)
+    }
+
+    /// Lane-wise `a + b * c`, computed as a separate multiply and add.
+    unsafe fn vec_fma(a: Self::Unit, b: Self::Unit, c: Self::Unit) -> Self::Unit {
+        _mm256_add_ps(_mm256_mul_ps(b, c), a)
+    }
+
+    /// Narrows the eight `f32` lanes of `a` to `bf16` and writes them to
+    /// `mem_addr`.
+    ///
+    /// The narrowing goes through `bf16::from_f32` for every lane. The F16C
+    /// `_mm256_cvtps_ph` intrinsic is deliberately not used: it emits IEEE
+    /// binary16 (`f16`) bit patterns, which are not valid bf16 encodings of
+    /// the same values.
+    ///
+    /// # Safety
+    /// `mem_addr` must be valid for writing eight `bf16` values.
+    unsafe fn vec_store(mem_addr: *mut bf16, a: Self::Unit) {
+        let mut tmp = [0.0f32; 8];
+        _mm256_storeu_ps(tmp.as_mut_ptr(), a);
+        for (i, v) in tmp.iter().enumerate() {
+            *mem_addr.add(i) = bf16::from_f32(*v);
+        }
+    }
+
+    /// Reduces the accumulator array to a single `f32` written to `*y`.
+    ///
+    /// Three halving passes fold the upper part of the array onto the lower
+    /// part; the eight lanes of `x[0]` are then summed horizontally.
+    ///
+    /// # Safety
+    /// `y` must be valid for writing one `f32`.
+    unsafe fn vec_reduce(mut x: Self::Array, y: *mut f32) {
+        let mut offset = ARR >> 1;
+        for i in 0..offset {
+            x[i] = _mm256_add_ps(x[i], x[offset + i]);
+        }
+        offset >>= 1;
+        for i in 0..offset {
+            x[i] = _mm256_add_ps(x[i], x[offset + i]);
+        }
+        offset >>= 1;
+        for i in 0..offset {
+            x[i] = _mm256_add_ps(x[i], x[offset + i]);
+        }
+        // Add the high and low 128-bit halves, then two horizontal adds
+        // collapse the remaining four lanes into lane 0.
+        let t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), _mm256_extractf128_ps(x[0], 1));
+        let t1 = _mm_hadd_ps(t0, t0);
+        *y = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));
+    }
+}