Merged

Changes from 12 commits
15 changes: 14 additions & 1 deletion .github/workflows/ci.yml
@@ -30,17 +30,30 @@ jobs:
env:
RUSTFLAGS: "-Dwarnings"

- name: Clippy (candle)
run: cargo clippy --all-targets --features candle
env:
RUSTFLAGS: "-Dwarnings"

- name: Build
run: cargo build

- name: Build (candle)
run: cargo build --features candle

- name: Test
run: cargo test

- name: Test (candle)
run: cargo test --features candle

- name: Security audit
run: cargo install cargo-audit --locked && cargo audit

- name: Install rustqual
run: cargo install rustqual

- name: Quality analysis
run: rustqual src/ --fail-on-warnings
# Scan the whole repository (src/, tests/, benches/, examples/)
# so that test-code quality issues are not silently ignored.
run: rustqual . --fail-on-warnings
1 change: 1 addition & 0 deletions .gitignore
@@ -7,3 +7,4 @@ Cargo.lock
.claude/
coverage.lcov
CLAUDE.md
docs/
61 changes: 61 additions & 0 deletions CHANGELOG.md
@@ -7,6 +7,67 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [0.4.0] - 2026-04-19

### Changed

- **Breaking: per-layer locking architecture.** `PqoCache` and `TqCache` now
use `Vec<parking_lot::Mutex<LayerStorage>>` internally, so calls for
different layers no longer serialize on a global mutex. This enables
concurrent forward passes (e.g. speculative decoding draft + target) to
run without lock contention.
- **Breaking: `mistralrs-kv-cache` trait bumped to 0.3**. All mutating
trait methods now take `&self` instead of `&mut self`. Inference engines
can now hold a plain `Arc<dyn CompressedKVCache>` instead of
`Arc<Mutex<dyn CompressedKVCache>>`. See `mistralrs-kv-cache`
[CHANGELOG 0.3.0](https://github.com/SaschaOnTour/mistralrs-kv-cache/blob/main/CHANGELOG.md#030---2026-04-19)
for the migration guide.
- **`CompressedStorage` split**: public API pivots to `StorageMetadata` +
`LayerStorage` + `LayerBuffers<'_>`. `CompressedStorage` is removed.
`LayerStorage::buffers()` replaces the four individual
`k_indices`/`v_indices`/`k_scales`/`v_scales` accessors.
- **Lazy `GpuPrecomputed` init** now uses `std::sync::OnceLock` with a
helper `ensure_gpu_precomputed()`, replacing the previous `&mut self`
`ensure_precomputed` method on each cache.
- **Shared test-utility module**: `turboquant::test_utils` is now
`#[doc(hidden)] pub` so integration tests, benches, and examples can
import the LCG helpers and `make_kv` / `pseudo_random_vec` generators
without each redefining them. The module is publicly reachable (and
therefore part of the SemVer surface) but hidden from rustdoc; it is
intended only for cross-file test/bench/example code.
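Taken together, the per-layer mutex vector and the `&self` mutation surface can be sketched roughly as follows — a dependency-free stand-in that uses `std::sync::Mutex` in place of `parking_lot::Mutex`, with hypothetical `PerLayerCache` / `append` names rather than the crate's actual `PqoCache`/`TqCache` API:

```rust
use std::sync::{Arc, Mutex};
use std::thread;

// Hypothetical stand-in for the crate's per-layer storage.
#[derive(Default)]
struct LayerStorage {
    tokens: Vec<u32>,
}

// One mutex per layer: appends to different layers never contend,
// and mutation goes through `&self`, so callers can share a plain
// `Arc<PerLayerCache>` without an outer `Mutex`.
struct PerLayerCache {
    layers: Vec<Mutex<LayerStorage>>,
}

impl PerLayerCache {
    fn new(num_layers: usize) -> Self {
        let layers = (0..num_layers)
            .map(|_| Mutex::new(LayerStorage::default()))
            .collect();
        Self { layers }
    }

    fn append(&self, layer: usize, token: u32) {
        // Locks only this layer's storage, not the whole cache.
        self.layers[layer].lock().unwrap().tokens.push(token);
    }

    fn len(&self, layer: usize) -> usize {
        self.layers[layer].lock().unwrap().tokens.len()
    }
}
```

Two threads appending to layer 0 and layer 1 through a shared `Arc<PerLayerCache>` never block each other, which is the draft/target speculative-decoding scenario the entry above describes.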
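The `OnceLock`-based lazy init follows the standard `get_or_init` pattern; the contents of `GpuPrecomputed` and the init closure below are placeholders, not the crate's actual precomputation:

```rust
use std::sync::OnceLock;

struct GpuPrecomputed {
    table: Vec<f32>, // placeholder for precomputed projection data
}

struct Cache {
    gpu: OnceLock<GpuPrecomputed>,
}

impl Cache {
    fn new() -> Self {
        Self { gpu: OnceLock::new() }
    }

    // The first caller pays the init cost; every later call returns
    // the cached value. Crucially, this needs only `&self`, unlike
    // the old `&mut self` ensure_precomputed method.
    fn ensure_gpu_precomputed(&self) -> &GpuPrecomputed {
        self.gpu.get_or_init(|| GpuPrecomputed { table: vec![0.5; 4] })
    }
}
```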

### Added

- **New concurrency tests** (`tests/cache_concurrency_tests.rs`):
- `parallel_decode_different_layers` — verifies two threads can decode
into layer 0 and layer 1 simultaneously.
- `parallel_prefill_no_corruption` — compares parallel vs serial prefill.
- `concurrent_reset_decode` — stress-tests reset/decode race.
- `layer_independence_under_contention` — 8 threads × 30 decodes, all
layers independent.
- **`LayerStorage::validate()`** — cross-field invariant check, called
from `append` via `debug_assert!` to catch state inconsistencies.
- **Upstream rustqual bug reports** — filed for three rustqual
false-positives encountered during the refactor.
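The `validate()`-plus-`debug_assert!` pattern might look like the following sketch; the fields and the invariant are illustrative, not the crate's actual `LayerStorage` layout:

```rust
// Illustrative cross-field invariant: the flat index buffer and the
// per-vector scale buffer must describe the same number of vectors.
struct LayerStorage {
    k_indices: Vec<u8>,
    k_scales: Vec<f32>,
    indices_per_vec: usize,
}

impl LayerStorage {
    fn validate(&self) -> bool {
        self.indices_per_vec > 0
            && self.k_indices.len() % self.indices_per_vec == 0
            && self.k_indices.len() / self.indices_per_vec == self.k_scales.len()
    }

    fn append(&mut self, indices: &[u8], scale: f32) {
        self.k_indices.extend_from_slice(indices);
        self.k_scales.push(scale);
        // Zero-cost in release builds, loud in debug and test builds.
        debug_assert!(self.validate(), "LayerStorage invariant broken");
    }
}
```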

### Fixed

- **IOSP violation in `TqCache::reset`** — switched to iterator-chain
form so rustqual no longer counts it as a logic+call violation.
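The iterator-chain shape of the fixed `reset` is roughly the following (a sketch only — the real `TqCache` holds quantized layer buffers, not `Vec<u32>`, and uses `parking_lot`):

```rust
use std::sync::Mutex;

struct TqCache {
    layers: Vec<Mutex<Vec<u32>>>,
}

impl TqCache {
    // A single iterator chain: no branching or index bookkeeping mixed
    // in with the per-layer calls, which is the separation the rustqual
    // IOSP rule checks for.
    fn reset(&self) {
        self.layers
            .iter()
            .for_each(|layer| layer.lock().unwrap().clear());
    }
}
```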

### Performance

- Uncontended single-stream decode is unchanged (`parking_lot::Mutex` is
roughly 2× faster than `std::sync::Mutex` when uncontended).
- Multi-stream / multi-layer concurrent decode is now truly parallel —
previously all layers serialized on one mutex per cache.

## [0.3.1] - Undocumented release

No changelog entry was recorded for this release.

## [0.3.0] - Release date not recorded

### Changed

- **CI hardening**: All GitHub Actions pinned to immutable commit SHAs, explicit `permissions: contents: read`, `cargo audit` step added.
7 changes: 4 additions & 3 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "turboquant-rs"
version = "0.3.1"
version = "0.4.0"
edition = "2021"
authors = ["Sascha <sascha@privora.com>"]
description = "TurboQuant KV-Cache Quantization — 3-bit compression with zero accuracy loss (Zandieh et al., ICLR 2026)"
@@ -25,9 +25,10 @@ cuda = ["candle", "dep:cudaforge", "candle-core/cuda"]
[dependencies]
half = "2"
thiserror = "2"
parking_lot = "0.12"
serde = { version = "1", features = ["derive"], optional = true }
candle-core = { version = ">=0.10.2", optional = true }
mistralrs-kv-cache = { version = ">=0.2.0", optional = true }
candle-core = { version = "0.10.2", optional = true }
mistralrs-kv-cache = { version = "0.3.0", optional = true }

[build-dependencies]
cudaforge = { version = "0.1.2", optional = true }
20 changes: 4 additions & 16 deletions benches/quantize_bench.rs
@@ -1,12 +1,14 @@
//! Criterion benchmarks for TurboQuant quantization, dequantization,
//! QJL inner-product estimation, and attention operations.

// qual:allow(BP-010) — criterion::bench_with_input closure signatures are mandated by the library and cannot be refactored
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use turboquant::attention::QuantizedKVCache;
use turboquant::packed::TurboQuantConfig;
use turboquant::qjl::{estimate_inner_product, precompute_query_projections, quantize_with_qjl};
use turboquant::quantize::{dequantize_vec, quantize_vec};
use turboquant::test_utils::pseudo_random_vec;

// ---------------------------------------------------------------------------
// Constants
@@ -21,9 +23,6 @@ const BITS_TQ4: u8 = 4;

const ROTATION_SEED: u64 = 42;
const QJL_SEED: u64 = 12345;
const LCG_MULTIPLIER: u64 = 6_364_136_223_846_793_005;
const LCG_INCREMENT: u64 = 1;
const LCG_SHIFT: u32 = 33;

const CACHE_SEQ_LEN: usize = 1024;
const BENCH_NUM_LAYERS: usize = 1;
@@ -33,19 +32,6 @@ const BENCH_LAYER: usize = 0;
// Helpers
// ---------------------------------------------------------------------------

fn pseudo_random_vec(dim: usize, seed: u64) -> Vec<f32> {
let mut state = seed;
(0..dim)
.map(|_| {
state = state
.wrapping_mul(LCG_MULTIPLIER)
.wrapping_add(LCG_INCREMENT);
let bits = (state >> LCG_SHIFT) as i32;
bits as f32 / (i32::MAX as f32)
})
.collect()
}

fn make_config(bits: u8, dim: usize) -> TurboQuantConfig {
TurboQuantConfig::new(bits, dim)
.unwrap()
@@ -56,6 +42,7 @@ fn make_config(bits: u8, dim: usize) -> TurboQuantConfig {
// Benchmark: quantize_vec
// ---------------------------------------------------------------------------

// qual:allow(BP-010) — criterion benchmark_group idiom
fn bench_quantize(c: &mut Criterion) {
let mut group = c.benchmark_group("quantize_vec");

@@ -67,6 +54,7 @@ ] {
] {
let config = make_config(bits, dim);
let data = pseudo_random_vec(dim, 1000);
// qual:allow(BP-010) — criterion idiom: `format!` label + bench_with_input closure is mandated by the library
let label = format!("tq{bits}_d{dim}");

group.bench_with_input(BenchmarkId::new("polarquant", &label), &data, |b, data| {
146 changes: 0 additions & 146 deletions docs/benchmarks.md

This file was deleted.

24 changes: 6 additions & 18 deletions examples/kv_cache_demo.rs
@@ -37,14 +37,7 @@ const NUM_ENTRIES: usize = 1024;
/// Number of attention scores to display.
const DISPLAY_SCORES: usize = 8;

/// LCG multiplier (Knuth's constant).
const LCG_MULTIPLIER: u64 = 6_364_136_223_846_793_005;

/// LCG increment.
const LCG_INCREMENT: u64 = 1;

/// Right-shift for extracting bits from LCG state.
const LCG_SHIFT: u32 = 33;
use turboquant::test_utils::{pseudo_random_vec, LCG_MULTIPLIER};

/// Amplitude for key vector generation.
const KEY_AMPLITUDE: f32 = 1.0;
@@ -68,17 +61,12 @@ const BYTES_PER_KB: f64 = 1024.0;
// Helpers
// ---------------------------------------------------------------------------

/// Deterministic pseudo-random vector using a simple LCG.
/// Deterministic pseudo-random vector scaled by `amplitude`, delegating the
/// core LCG to the shared `test_utils::pseudo_random_vec`.
fn lcg_vec(dim: usize, seed: u64, amplitude: f32) -> Vec<f32> {
let mut state = seed;
(0..dim)
.map(|_| {
state = state
.wrapping_mul(LCG_MULTIPLIER)
.wrapping_add(LCG_INCREMENT);
let bits = (state >> LCG_SHIFT) as i32;
amplitude * (bits as f32 / i32::MAX as f32)
})
pseudo_random_vec(dim, seed)
.into_iter()
.map(|x| amplitude * x)
.collect()
}
