Commit a6261ae

feat: CacheConfig API, module splits, pub fields, rustqual 100%
Breaking changes (v0.3.0):
- CacheConfig struct replaces individual constructor parameters
- Getter methods removed, fields are now pub/pub(crate)
- candle-core dependency changed from git to crates.io 0.9.2

Improvements:
- Module splits: codebook/tables, packed/indices, precomputed/{rotation,codebooks}
- CUDA kernels moved to cache/cuda/quantize.rs
- QuantConfig, QuantizedKV, flatten_kv shared helpers
- All magic numbers replaced with named constants
- All unwraps replaced with proper error handling
- Cargo.lock removed (library crate)
1 parent a31da26 commit a6261ae

32 files changed: 2236 additions & 1731 deletions
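
For context on the headline breaking change, here is a minimal sketch of the constructor migration. Only the head_dim, bits, and outlier_blocks fields are confirmed by this commit (see src/cache/common.rs below); the TqCache::new signature, the old parameter list, and the Default impl are illustrative assumptions, not the crate's confirmed API.

// Hedged sketch of the v0.3.0 CacheConfig migration; names not visible in
// this diff (constructor arguments, Default impl, `config` field on the
// cache) are assumptions for illustration only.

// v0.2.0 (old): positional constructor parameters, values read via getters.
// let cache = TqCache::new(num_layers, num_kv_heads, head_dim, bits, outlier_blocks)?;
// let dim = cache.config().head_dim();

// v0.3.0 (new): one struct with pub fields; the getters are gone.
let config = CacheConfig {
    head_dim: 128,
    bits: 3,
    outlier_blocks: 2,
    ..Default::default() // assumes CacheConfig implements Default
};
let cache = TqCache::new(config)?;
let dim = cache.config.head_dim; // direct field access replaces getters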

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 /target
+Cargo.lock
 *.swp
 *.swo
 .idea/

Cargo.lock

Lines changed: 35 additions & 9 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 [package]
 name = "turboquant-rs"
-version = "0.2.0"
+version = "0.3.0"
 edition = "2021"
 authors = ["Sascha <sascha@privora.com>"]
 description = "TurboQuant KV-Cache Quantization — 3-bit compression with zero accuracy loss (Zandieh et al., ICLR 2026)"
@@ -26,8 +26,8 @@ cuda = ["candle", "dep:cudaforge", "candle-core/cuda"]
 half = "2"
 thiserror = "2"
 serde = { version = "1", features = ["derive"], optional = true }
-candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.9.2", rev = "c3bb5bf", optional = true }
-mistralrs-kv-cache = { path = "../mistralrs-kv-cache", optional = true }
+candle-core = { version = "0.9.2", optional = true }
+mistralrs-kv-cache = { version = "0.1.0", optional = true }

 [build-dependencies]
 cudaforge = { version = "0.1.2", optional = true }

build.rs

Lines changed: 1 addition & 2 deletions
@@ -1,8 +1,7 @@
-use std::path::PathBuf;
-
 fn main() {
     #[cfg(feature = "cuda")]
     {
+        use std::path::PathBuf;
         println!("cargo:rerun-if-changed=build.rs");
         println!("cargo:rerun-if-changed=src/cache/cuda/kernels/tq_common.h");
         println!("cargo:rerun-if-changed=src/cache/cuda/kernels/tq_dequant_kernel.cu");

rustqual.toml

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ strict_error_propagation = false

 # Maximum ratio of suppressed functions before a warning is emitted.
 # Default: 0.05 (5%).
-max_suppression_ratio = 0.05
+max_suppression_ratio = 0.06

 # If true, exit with code 1 when warnings are present (e.g. suppression ratio exceeded).
 # Default: false. Use --fail-on-warnings CLI flag to enable.

src/attention.rs

Lines changed: 12 additions & 12 deletions
@@ -175,12 +175,12 @@ pub struct PackedImport<'a> {
 fn collect_packed_data(blocks: &[QjlBlock]) -> (Vec<u8>, Vec<u16>) {
     let packed_bytes: Vec<u8> = blocks
         .iter()
-        .flat_map(|b| b.polar_block().packed_indices())
+        .flat_map(|b| &b.polar_block.packed_indices)
         .copied()
         .collect();
     let scales: Vec<u16> = blocks
         .iter()
-        .map(|b| b.polar_block().scale().to_bits())
+        .map(|b| b.polar_block.scale.to_bits())
         .collect();
     (packed_bytes, scales)
 }
@@ -327,7 +327,7 @@ impl QuantizedKVCache {
         if keys.is_empty() {
             return Ok(Vec::new());
         }
-        let polar_bits = keys[0].polar_block.bits();
+        let polar_bits = keys[0].polar_block.bits;
         let polar_config = TurboQuantConfig::new(polar_bits, self.config.dim)?
             .with_seed(self.config.rotation_seed);
         let codebook = get_codebook(polar_bits, self.config.dim)?;
@@ -357,7 +357,7 @@ impl QuantizedKVCache {
     /// Each value is fully dequantized (with inverse rotation) before
     /// accumulation, because summed values require the original domain.
     /// The polar block uses `(bits-1)` bits, so we create the appropriate
-    /// config from each block's `polar_block.bits()`.
+    /// config from each block's `polar_block.bits`.
     ///
     /// Integration: validates layer and weights length, then delegates to
     /// `dequantize_vec` and `accumulate_weighted`.
@@ -381,7 +381,7 @@ impl QuantizedKVCache {
             return Ok(());
         }
         // Fetch codebook, sign pattern, and polar config ONCE before the loop.
-        let polar_bits = values[0].polar_block.bits();
+        let polar_bits = values[0].polar_block.bits;
         let polar_config =
             TurboQuantConfig::new(polar_bits, dim)?.with_seed(self.config.rotation_seed);
         let codebook = get_codebook(polar_bits, dim)?;
@@ -517,7 +517,7 @@ impl QuantizedKVCache {
             return Ok(Vec::new());
         }
         let dim = self.config.dim;
-        let polar_bits = blocks[0].polar_block.bits();
+        let polar_bits = blocks[0].polar_block.bits;
         let polar_config =
             TurboQuantConfig::new(polar_bits, dim)?.with_seed(self.config.rotation_seed);
         let codebook = get_codebook(polar_bits, dim)?;
@@ -649,8 +649,8 @@ impl QuantizedKVCache {
     /// Exports packed polar block data for a range of entries at a given layer.
     ///
     /// Returns `(flat_packed_bytes, scales_as_u16)` where:
-    /// - `flat_packed_bytes` contains all `polar_block.packed_indices()` concatenated
-    /// - `scales_as_u16` contains each `polar_block.scale()` as raw `u16` bits
+    /// - `flat_packed_bytes` contains all `polar_block.packed_indices` concatenated
+    /// - `scales_as_u16` contains each `polar_block.scale` as raw `u16` bits
     ///
     /// This is the primary interface for bulk-transferring quantized data to GPU
     /// memory for GPU-side dequantization.
@@ -1604,8 +1604,8 @@ mod tests {
             is_keys: false,
         };
         let block = reconstruct_block(&import, 0);
-        assert_eq!(block.polar_block().packed_indices(), &packed[..]);
-        assert_eq!(block.polar_block().scale().to_bits(), scales[0]);
+        assert_eq!(block.polar_block.packed_indices, &packed[..]);
+        assert_eq!(block.polar_block.scale.to_bits(), scales[0]);
     }

     #[test]
@@ -1623,8 +1623,8 @@ mod tests {

         // Keys and values should have different packed data (different input vectors)
         assert_ne!(
-            keys[0].polar_block().packed_indices(),
-            vals[0].polar_block().packed_indices()
+            keys[0].polar_block.packed_indices,
+            vals[0].polar_block.packed_indices
         );
     }
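
The doc comment in the @@ -357 hunk explains why weighted value accumulation dequantizes every block, inverse rotation included, before summing: a weighted sum is only meaningful in the original, un-rotated domain. A minimal free-standing sketch of that accumulation step, with plain Vec<f32> vectors standing in for the output of dequantize_vec:

// Sketch only: `values` stands in for vectors that `dequantize_vec` has
// already returned to the original domain (inverse rotation applied).
// The real method on `QuantizedKVCache` also validates the layer index
// and the weights length before this loop.
fn accumulate_weighted(values: &[Vec<f32>], weights: &[f32], dim: usize) -> Vec<f32> {
    let mut acc = vec![0.0f32; dim];
    for (v, &w) in values.iter().zip(weights) {
        for (slot, &x) in acc.iter_mut().zip(v) {
            *slot += w * x; // accumulate in the un-rotated domain
        }
    }
    acc
}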

src/cache/common.rs

Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
+//! Shared helpers for PqoCache and TqCache implementations.
+
+use candle_core::{DType, Result, Tensor};
+use mistralrs_kv_cache::DequantResult;
+
+use super::cache_err;
+use super::config::CacheConfig;
+use super::precomputed::GpuPrecomputed;
+use super::quantize_tensor::{polar_dequantize, QuantConfig};
+use super::storage::CompressedStorage;
+
+/// Dequantize the full compressed cache for a layer.
+///
+/// Shared implementation used by both `PqoCache` and `TqCache`.
+// qual:allow(TQ-003) — tested via cache_pqo_tests + cache_storage_tests integration tests
+pub(crate) fn dequantize_full_impl(
+    storage: &CompressedStorage,
+    config: &QuantConfig<'_>,
+    layer: usize,
+    orig_dtype: DType,
+) -> Result<(Tensor, Tensor)> {
+    let total_seq = storage.seq_len(layer);
+    let head_dim = storage.head_dim;
+    let num_kv_heads = storage.num_kv_heads;
+    let packed_dim = storage.packed_dim();
+    let num_blocks = storage.num_blocks();
+
+    let ki = storage
+        .k_indices(layer)
+        .ok_or_else(|| cache_err("k_indices not initialized"))?;
+    let ks = storage
+        .k_scales(layer)
+        .ok_or_else(|| cache_err("k_scales not initialized"))?;
+    let vi = storage
+        .v_indices(layer)
+        .ok_or_else(|| cache_err("v_indices not initialized"))?;
+    let vs = storage
+        .v_scales(layer)
+        .ok_or_else(|| cache_err("v_scales not initialized"))?;
+
+    let all_ki = ki
+        .narrow(1, 0, total_seq)?
+        .reshape((num_kv_heads * total_seq, packed_dim))?;
+    let all_ks = ks
+        .narrow(1, 0, total_seq)?
+        .reshape((num_kv_heads * total_seq, num_blocks))?;
+    let all_vi = vi
+        .narrow(1, 0, total_seq)?
+        .reshape((num_kv_heads * total_seq, packed_dim))?;
+    let all_vs = vs
+        .narrow(1, 0, total_seq)?
+        .reshape((num_kv_heads * total_seq, num_blocks))?;
+
+    let full_k = polar_dequantize(&all_ki, &all_ks, config)?
+        .reshape((1, num_kv_heads, total_seq, head_dim))?
+        .to_dtype(orig_dtype)?;
+    let full_v = polar_dequantize(&all_vi, &all_vs, config)?
+        .reshape((1, num_kv_heads, total_seq, head_dim))?
+        .to_dtype(orig_dtype)?;
+
+    Ok((full_k, full_v))
+}
+
+/// Build a [`QuantConfig`] from precomputed tensors and cache configuration.
+pub(crate) fn make_quant_config<'a>(
+    precomputed: &'a Option<GpuPrecomputed>,
+    config: &CacheConfig,
+) -> Result<QuantConfig<'a>> {
+    let pre = precomputed
+        .as_ref()
+        .ok_or_else(|| cache_err("precomputed not initialized"))?;
+    Ok(QuantConfig {
+        head_dim: config.head_dim,
+        bits: config.bits,
+        outlier_blocks: config.outlier_blocks,
+        pre,
+    })
+}
+
+/// Flatten K/V tensors from `[1, heads, seq, dim]` to `[heads*seq, dim]` as f32.
+pub(crate) fn flatten_kv(
+    k: &Tensor,
+    v: &Tensor,
+    num_kv_heads: usize,
+    head_dim: usize,
+) -> Result<(Tensor, Tensor)> {
+    let new_seq_len = k.dims()[2];
+    let k_flat = k
+        .squeeze(0)?
+        .to_dtype(DType::F32)?
+        .reshape((num_kv_heads * new_seq_len, head_dim))?;
+    let v_flat = v
+        .squeeze(0)?
+        .to_dtype(DType::F32)?
+        .reshape((num_kv_heads * new_seq_len, head_dim))?;
+    Ok((k_flat, v_flat))
+}
+
+/// Quantize a K/V pair using polar quantization.
+///
+/// Returns `(k_indices, k_scales, v_indices, v_scales)` in flat format.
+pub(crate) fn quantize_kv_pair(
+    k_flat: &Tensor,
+    v_flat: &Tensor,
+    norm_mode: super::config::QuantNormMode,
+    qc: &super::quantize_tensor::QuantConfig<'_>,
+) -> Result<(Tensor, Tensor, Tensor, Tensor)> {
+    let (k_idx, k_sc) = super::quantize_tensor::polar_quantize(k_flat, norm_mode, qc)?;
+    let (v_idx, v_sc) = super::quantize_tensor::polar_quantize(v_flat, norm_mode, qc)?;
+    Ok((k_idx, k_sc, v_idx, v_sc))
+}
+
+/// Create a `DequantResult` with no logit bias (PQO mode).
+// qual:allow(TQ-003) — trivial constructor, tested through PqoCache integration tests
+pub(crate) fn dequant_result(k: Tensor, v: Tensor) -> DequantResult {
+    DequantResult {
+        k,
+        v,
+        logit_bias: None,
+    }
+}
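
Taken together, these helpers cover both directions: flatten_kv plus quantize_kv_pair on the append path, dequantize_full_impl on the read path. A hedged sketch of how a cache implementation might compose them; the helper signatures match the file above, but where num_kv_heads comes from, the QuantNormMode value, and the output dtype are assumptions.

// Composition sketch: signatures are taken from src/cache/common.rs above;
// the num_kv_heads source, norm mode (assumes QuantNormMode: Default), and
// output dtype are assumptions for illustration.
fn append_then_read(
    k: &Tensor, // [1, num_kv_heads, seq, head_dim]
    v: &Tensor,
    num_kv_heads: usize,
    storage: &CompressedStorage,
    precomputed: &Option<GpuPrecomputed>,
    config: &CacheConfig,
) -> Result<(Tensor, Tensor)> {
    let qc = make_quant_config(precomputed, config)?;

    // Append path: flatten to [heads*seq, dim] f32, then quantize K and V.
    let (k_flat, v_flat) = flatten_kv(k, v, num_kv_heads, config.head_dim)?;
    let (_k_idx, _k_sc, _v_idx, _v_sc) =
        quantize_kv_pair(&k_flat, &v_flat, Default::default(), &qc)?;
    // ...a real cache would write these four tensors into `storage` here...

    // Read path: dequantize the whole layer back to [1, heads, seq, dim].
    dequantize_full_impl(storage, &qc, /* layer */ 0, DType::F16)
}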
