diff --git a/src/lib/mlxcel-core/src/cache/paged_detach.rs b/src/lib/mlxcel-core/src/cache/paged_detach.rs index 93cd4190..392b2db7 100644 --- a/src/lib/mlxcel-core/src/cache/paged_detach.rs +++ b/src/lib/mlxcel-core/src/cache/paged_detach.rs @@ -258,6 +258,7 @@ impl Drop for DetachedPagedCacheSet { /// Internal parked-cache representation. Used by the pool so both dense and /// paged detached sets share a single [`DetachedHandle`] space. +#[allow(clippy::large_enum_variant)] pub(super) enum ParkedCache { Dense(super::detach::DetachedCacheSet), Paged(DetachedPagedCacheSet), @@ -434,6 +435,7 @@ impl CachePool { /// /// Retained block pins are preserved across failure paths so the caller /// can still call [`CachePool::release_detached_paged`] manually or retry. + #[allow(clippy::result_large_err)] pub fn adopt_paged_preserving( &mut self, model: &dyn crate::generate::LanguageModel, diff --git a/src/lib/mlxcel-core/src/cache/turbo/boundary.rs b/src/lib/mlxcel-core/src/cache/turbo/boundary.rs index fc73ae89..c0e08cce 100644 --- a/src/lib/mlxcel-core/src/cache/turbo/boundary.rs +++ b/src/lib/mlxcel-core/src/cache/turbo/boundary.rs @@ -373,9 +373,9 @@ mod tests { // Cross-check the single-layer helper against the bulk helper. let n = 16; let bulk = resolve_layer_modes(KVCacheMode::Turbo4Asym, n, 2); - for i in 0..n { + for (i, &expected) in bulk.iter().enumerate().take(n) { let single = resolve_layer_mode(KVCacheMode::Turbo4Asym, i, n, 2); - assert_eq!(bulk[i], single, "layer {i} mismatch"); + assert_eq!(expected, single, "layer {i} mismatch"); } } } diff --git a/src/lib/mlxcel-core/src/cache/turbo/codebook.rs b/src/lib/mlxcel-core/src/cache/turbo/codebook.rs index d3a2620a..87dee057 100644 --- a/src/lib/mlxcel-core/src/cache/turbo/codebook.rs +++ b/src/lib/mlxcel-core/src/cache/turbo/codebook.rs @@ -507,7 +507,7 @@ mod tests { // Φ(−1) ≈ 0.15865525393145702 let v = gaussian_cdf(-1.0); - assert!((v - 0.158_655_253_931_457_0).abs() < 1e-12, "Φ(-1) = {v}"); + assert!((v - 0.158_655_253_931_457).abs() < 1e-12, "Φ(-1) = {v}"); } #[test] diff --git a/src/lib/mlxcel-core/src/cache/turbo/pack3.rs b/src/lib/mlxcel-core/src/cache/turbo/pack3.rs index bfaf4610..577dfe3b 100644 --- a/src/lib/mlxcel-core/src/cache/turbo/pack3.rs +++ b/src/lib/mlxcel-core/src/cache/turbo/pack3.rs @@ -87,7 +87,7 @@ pub const V_BIT_WIDTH_3: u8 = 3; #[inline] pub fn packed_bytes_per_token_3bit(head_dim: i32) -> i32 { debug_assert!( - head_dim > 0 && (head_dim as usize) % COORDS_PER_GROUP == 0, + head_dim > 0 && (head_dim as usize).is_multiple_of(COORDS_PER_GROUP), "packed_bytes_per_token_3bit: head_dim must be a positive multiple of 8; \ got {head_dim}" ); @@ -109,7 +109,7 @@ pub fn packed_bytes_per_token_3bit(head_dim: i32) -> i32 { /// pipeline). pub fn pack_3bit_indices(indices: &[u8], out: &mut [u8]) { debug_assert!( - indices.len() % COORDS_PER_GROUP == 0, + indices.len().is_multiple_of(COORDS_PER_GROUP), "pack_3bit_indices: index count ({}) must be a multiple of {}", indices.len(), COORDS_PER_GROUP @@ -147,7 +147,7 @@ pub fn pack_3bit_indices(indices: &[u8], out: &mut [u8]) { /// dequant path). pub fn unpack_3bit_indices(packed: &[u8], out: &mut [u8]) { debug_assert!( - packed.len() % BYTES_PER_GROUP == 0, + packed.len().is_multiple_of(BYTES_PER_GROUP), "unpack_3bit_indices: packed byte count ({}) must be a multiple of {}", packed.len(), BYTES_PER_GROUP @@ -311,7 +311,7 @@ mod tests { #[test] fn round_trip_random_multi_group() { // 32 indices = 4 groups = 12 bytes packed. Use a deterministic LCG. - let mut state: u32 = 0xC0FFEE_42; + let mut state: u32 = 0xC0FF_EE42; let mut indices = vec![0u8; 32]; for x in &mut indices { state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); diff --git a/src/lib/mlxcel-core/src/cache/turbo/quant.rs b/src/lib/mlxcel-core/src/cache/turbo/quant.rs index 1f62c84e..e695cdd0 100644 --- a/src/lib/mlxcel-core/src/cache/turbo/quant.rs +++ b/src/lib/mlxcel-core/src/cache/turbo/quant.rs @@ -419,7 +419,7 @@ fn quantize_into_packed( let idx_off = tok * coords_per_token; let mut sum_sq = 0.0_f32; for d_i in 0..coords_per_token { - let c = centroids[indices[idx_off + d_i] as usize]; + let c = centroids[indices[idx_off + d_i]]; sum_sq += c * c; } let y_hat_norm = sum_sq.sqrt(); @@ -874,8 +874,7 @@ mod tests { let hd_usize = head_dim as usize; let mut k_data: Vec = Vec::with_capacity(n_tokens * hd_usize); let token_scales = [0.5_f32, 1.5, 4.0, 12.0]; - for tok in 0..n_tokens { - let scale = token_scales[tok]; + for &scale in token_scales.iter().take(n_tokens) { for _ in 0..head_dim { let mut acc = 0.0_f32; for _ in 0..6 { @@ -981,8 +980,7 @@ mod tests { // norm storage, which is acceptable in production but would noise // up this test. let token_scales = [0.5_f32, 1.5, 4.0, 12.0]; - for tok in 0..4 { - let scale = token_scales[tok]; + for &scale in &token_scales { for _ in 0..head_dim { // Pseudo-Gaussian via summed uniform bits → ~normal in [-1, 1]. let mut acc = 0.0_f32; @@ -1057,7 +1055,7 @@ mod tests { let head_dim: i32 = 128; let params = TurboQuantParams::new(head_dim as u32, 0xC1_DECAFu32); - let mut rng = Lcg32::new(0xC0FFEE_42); + let mut rng = Lcg32::new(0xC0FF_EE42); let n_tokens = 4usize; let hd_usize = head_dim as usize; let mut v_data: Vec = Vec::with_capacity(n_tokens * hd_usize); diff --git a/src/lib/mlxcel-core/src/cache/turbo/quant3.rs b/src/lib/mlxcel-core/src/cache/turbo/quant3.rs index 0d554c4e..e520ee6e 100644 --- a/src/lib/mlxcel-core/src/cache/turbo/quant3.rs +++ b/src/lib/mlxcel-core/src/cache/turbo/quant3.rs @@ -111,7 +111,7 @@ impl TurboQuantParams3 { "TurboQuantParams3::new: head_dim must be a non-zero power of two; got {head_dim}" ); assert!( - head_dim % 8 == 0, + head_dim.is_multiple_of(8), "TurboQuantParams3::new: head_dim must be a multiple of 8 \ for the 24-bit packing layout; got {head_dim}" ); @@ -373,8 +373,7 @@ mod tests { let mut state: u32 = 123; let mut v_data: Vec = Vec::with_capacity(4 * head_dim as usize); let token_scales = [0.5_f32, 1.5, 4.0, 12.0]; - for tok in 0..4 { - let scale = token_scales[tok]; + for &scale in &token_scales { for _ in 0..head_dim { let mut acc = 0.0_f32; for _ in 0..6 { diff --git a/src/lib/mlxcel-core/src/cache/turbo/sparse_v.rs b/src/lib/mlxcel-core/src/cache/turbo/sparse_v.rs index 8bd9454c..177d8fd0 100644 --- a/src/lib/mlxcel-core/src/cache/turbo/sparse_v.rs +++ b/src/lib/mlxcel-core/src/cache/turbo/sparse_v.rs @@ -380,6 +380,7 @@ pub fn compute_alive_mask( /// Used by: tests under `cache/turbo_tests.rs` (issue #480 unit tests). /// Production attention call sites continue to use the standard /// `attention()` path until the Metal kernel lands. +#[allow(clippy::too_many_arguments)] pub fn attention_sparse_v_turbo4( q: &MlxArray, k: &MlxArray, @@ -526,6 +527,7 @@ pub fn attention_sparse_v_turbo4( /// case. /// /// Used by: [`KVCache::sparse_v_attention`] (issue #505). +#[allow(clippy::too_many_arguments)] pub fn attention_sparse_v_turbo4_fused( q: &MlxArray, k: &MlxArray, diff --git a/src/lib/mlxcel-core/src/cache/turbo_tests.rs b/src/lib/mlxcel-core/src/cache/turbo_tests.rs index 829b707f..d1d22a67 100644 --- a/src/lib/mlxcel-core/src/cache/turbo_tests.rs +++ b/src/lib/mlxcel-core/src/cache/turbo_tests.rs @@ -364,8 +364,8 @@ fn turbo4_asym_clone_handle_install_then_dequant_matches_pre_detach() { let v_data: Vec = (0..2 * head_dim) .map(|i| (((i * 7) % 13) as f32 / 13.0) - 0.5) .collect(); - let k = ffi::from_slice_f32(&k_data, &[1, 1, 2, head_dim as i32]); - let v = ffi::from_slice_f32(&v_data, &[1, 1, 2, head_dim as i32]); + let k = ffi::from_slice_f32(&k_data, &[1, 1, 2, head_dim]); + let v = ffi::from_slice_f32(&v_data, &[1, 1, 2, head_dim]); let (_k1, v1_out) = cache.update_and_fetch(k, v); let v1 = flatten_fp32(&v1_out); @@ -901,13 +901,13 @@ mod boundary_v { fn typical_32_layer_split_protects_first_two_and_last_two() { let n = 32; let modes = resolve_layer_modes(KVCacheMode::Turbo4Asym, n, 2); - for i in 0..n { + for (i, &actual) in modes.iter().enumerate().take(n) { let expected = if i < 2 || i >= n - 2 { KVCacheMode::Fp16 } else { KVCacheMode::Turbo4Asym }; - assert_eq!(modes[i], expected, "layer {i}"); + assert_eq!(actual, expected, "layer {i}"); } } @@ -922,10 +922,10 @@ mod boundary_v { KVCacheMode::Turbo4Delegated, ] { let bulk = resolve_layer_modes(mode, n, 2); - for i in 0..n { + for (i, &expected) in bulk.iter().enumerate().take(n) { let single = resolve_layer_mode(mode, i, n, 2); assert_eq!( - bulk[i], single, + expected, single, "{mode:?} layer {i}: bulk vs single helper disagree" ); } @@ -1414,8 +1414,8 @@ fn turbo3_asym_clone_handle_install_then_dequant_matches_pre_detach() { let v_data: Vec = (0..2 * head_dim) .map(|i| (((i * 7) % 13) as f32 / 13.0) - 0.5) .collect(); - let k = ffi::from_slice_f32(&k_data, &[1, 1, 2, head_dim as i32]); - let v = ffi::from_slice_f32(&v_data, &[1, 1, 2, head_dim as i32]); + let k = ffi::from_slice_f32(&k_data, &[1, 1, 2, head_dim]); + let v = ffi::from_slice_f32(&v_data, &[1, 1, 2, head_dim]); let (_k1, v1_out) = cache.update_and_fetch(k, v); let v1 = flatten_fp32(&v1_out); @@ -1667,7 +1667,7 @@ fn turbo3_asym_layer_modes_apply_boundary_upgrade() { assert_eq!(modes.len(), 8); // First 2 + last 2 are upgraded to FP16; middle 4 stay Turbo3Asym. for (i, m) in modes.iter().enumerate() { - if i < 2 || i >= 6 { + if !(2..6).contains(&i) { assert_eq!(*m, KVCacheMode::Fp16, "layer {i} must be FP16 boundary"); } else { assert_eq!( @@ -2608,8 +2608,8 @@ fn delegated_dequant_sdpa_matches_reference_attention() { /// Steel-envelope vs cold-only fused composition parity (issue #531). /// /// `update_and_turbo4_delegated_attention` first tries the steel-envelope -/// kernel (issue #531, single Metal dispatch covering softmax + cold-V dequant -/// + hot-V accumulate). On the same hardware where the cold-only fused +/// kernel (issue #531, single Metal dispatch covering softmax, cold-V dequant, +/// and hot-V accumulate). On the same hardware where the cold-only fused /// composition path (issue #528) already produces correct output, the /// steel-envelope path must produce numerically equivalent output (within /// FP16 round-off) — both paths funnel through the same MLX softmax algebra diff --git a/src/lib/mlxcel-core/src/drafter/dflash/round_loop_batched.rs b/src/lib/mlxcel-core/src/drafter/dflash/round_loop_batched.rs index f17a9e3b..b9b12c55 100644 --- a/src/lib/mlxcel-core/src/drafter/dflash/round_loop_batched.rs +++ b/src/lib/mlxcel-core/src/drafter/dflash/round_loop_batched.rs @@ -714,6 +714,7 @@ mod tests { /// /// `argmax_fn(row, position, prev_token) -> next_token`. Tests seed /// this so the accept pattern is deterministic. + #[allow(clippy::type_complexity)] struct SyntheticTarget { argmax_fn: Box, concat_hidden_dim: i32, diff --git a/src/lib/mlxcel-core/src/generate.rs b/src/lib/mlxcel-core/src/generate.rs index cfa7bdbb..adc8a413 100644 --- a/src/lib/mlxcel-core/src/generate.rs +++ b/src/lib/mlxcel-core/src/generate.rs @@ -411,6 +411,7 @@ pub trait LanguageModel { /// amortizes weight-loading bandwidth across the batch. /// /// Used by: BatchScheduler (server continuous batching) + #[allow(clippy::needless_range_loop)] fn forward_batched( &self, input_ids: &MlxArray, @@ -822,7 +823,7 @@ impl CxxGenerator { let n_layers = self.caches.len(); let requested = crate::cache::turbo::boundary_v_layers_from_env(); let layer_modes = crate::cache::turbo::resolve_layer_modes(nominal, n_layers, requested); - for (cache, mode) in self.caches.iter_mut().zip(layer_modes.into_iter()) { + for (cache, mode) in self.caches.iter_mut().zip(layer_modes) { cache.mode = mode; } } @@ -2096,7 +2097,7 @@ mod tests { // n=0 is the very first decode iteration after prefill. Clearing here // would discard tensors needed for the pipelined next-step computation. let n = 0_usize; - assert!(!(n % 256 == 0 && n > 0)); + assert!(!(n.is_multiple_of(256) && n > 0)); } #[test] @@ -2259,8 +2260,10 @@ mod tests { let caller_bias = make_bias(&[(99, -3.0)]); let g = CxxGenerator::new(2).with_token_bias(cached); - let mut caller = SamplingConfig::default(); - caller.token_bias = caller_bias; + let caller = SamplingConfig { + token_bias: caller_bias, + ..SamplingConfig::default() + }; let composed = g.compose_sampling(&caller); // Caller's explicit bias is preserved verbatim, cached bias is ignored. diff --git a/src/lib/mlxcel-core/src/layers.rs b/src/lib/mlxcel-core/src/layers.rs index 7f5b1484..67946528 100644 --- a/src/lib/mlxcel-core/src/layers.rs +++ b/src/lib/mlxcel-core/src/layers.rs @@ -847,6 +847,7 @@ impl FusedQKVLinear { /// Preserves both quantization `biases` and true linear `bias` tensors /// when present; Qwen2-family checkpoints require q/k/v linear bias for /// sane logits. + #[allow(clippy::too_many_arguments)] pub fn from_weights_separate_with_mode( weights: &crate::weights::WeightMap, prefix: &str, @@ -1773,6 +1774,7 @@ fn na_attention_log_mode() -> NaAttentionLogMode { }) } +#[allow(clippy::too_many_arguments)] fn classify_na_attention_dispatch( q: &MlxArray, k: &MlxArray, @@ -1813,6 +1815,7 @@ fn classify_na_attention_dispatch( (route, fast_path_eligible, nax_eligible) } +#[allow(clippy::too_many_arguments)] fn na_attention_eligibility_reasons( q: &MlxArray, k: &MlxArray, @@ -1879,6 +1882,7 @@ fn na_attention_eligibility_reasons( (fast_reason, nax_reason) } +#[allow(clippy::too_many_arguments)] fn record_na_attention_dispatch( q: &MlxArray, k: &MlxArray, @@ -1894,7 +1898,7 @@ fn record_na_attention_dispatch( let count = DISPATCH_COUNT.fetch_add(1, Ordering::Relaxed) + 1; let should_log = match na_attention_log_mode() { NaAttentionLogMode::Off => false, - NaAttentionLogMode::Sampled => count <= 8 || count % 100 == 0, + NaAttentionLogMode::Sampled => count <= 8 || count.is_multiple_of(100), NaAttentionLogMode::All => true, }; if should_log { @@ -1993,6 +1997,11 @@ pub fn attention( /// Pointer-friendly attention wrapper for existing model call sites. /// /// Used by: Model attention call sites that still store masks as raw pointers +/// +/// # Safety +/// +/// `mask` must be either null or point to a valid, live `MlxArray`. The +/// referent must remain valid for the duration of this call. pub unsafe fn attention_from_ptr( q: &MlxArray, k: &MlxArray, diff --git a/src/lib/mlxcel-core/src/sampling.rs b/src/lib/mlxcel-core/src/sampling.rs index d273ba9d..9fd630f6 100644 --- a/src/lib/mlxcel-core/src/sampling.rs +++ b/src/lib/mlxcel-core/src/sampling.rs @@ -977,11 +977,10 @@ mod tests { "top token should survive nucleus (got {})", v[0] ); - for i in 1..5 { + for (i, val) in v.iter().enumerate().take(5).skip(1) { assert!( - v[i].is_infinite() && v[i] < 0.0, - "token {i} should be filtered to -inf (got {})", - v[i] + val.is_infinite() && *val < 0.0, + "token {i} should be filtered to -inf (got {val})", ); } } diff --git a/src/lib/mlxcel-core/src/speculative/mod.rs b/src/lib/mlxcel-core/src/speculative/mod.rs index 2886840a..15b198d4 100644 --- a/src/lib/mlxcel-core/src/speculative/mod.rs +++ b/src/lib/mlxcel-core/src/speculative/mod.rs @@ -746,8 +746,10 @@ mod tests { let caller_bias = make_bias(&[(42, f32::NEG_INFINITY)]); let g = SpeculativeGenerator::new(2, 1).with_token_bias(cached); - let mut caller = SamplingConfig::default(); - caller.token_bias = caller_bias; + let caller = SamplingConfig { + token_bias: caller_bias, + ..SamplingConfig::default() + }; let target = g.compose_target_sampling(&caller); assert_eq!( diff --git a/src/lib/mlxcel-core/src/speculative/mtp/generator.rs b/src/lib/mlxcel-core/src/speculative/mtp/generator.rs index 383725cd..95faae56 100644 --- a/src/lib/mlxcel-core/src/speculative/mtp/generator.rs +++ b/src/lib/mlxcel-core/src/speculative/mtp/generator.rs @@ -26,15 +26,15 @@ //! arm its `set_shared_kv` with the seed slabs. //! 3. **Round loop.** While more tokens remain: //! a. Drafter produces `K-1` proposals via -//! [`crate::drafter::Drafter::draft_block`] (autoregressive, with -//! RoPE queries frozen at the bonus token's absolute position). +//! [`crate::drafter::Drafter::draft_block`] (autoregressive, with +//! RoPE queries frozen at the bonus token's absolute position). //! b. Target verify on `[bonus, draft_0, …, draft_{K-2}]` via -//! [`MtpTarget::verify`] — produces `target_tokens`, the next -//! hidden, and the re-sliced shared K/V. +//! [`MtpTarget::verify`] — produces `target_tokens`, the next +//! hidden, and the re-sliced shared K/V. //! c. Compare draft vs target via [`super::speculative_walk`]. //! d. Emit `new_tokens`. Update `bonus` to the last emitted token. //! e. Rebind the drafter against the new shared K/V via -//! [`crate::drafter::Drafter::set_shared_kv`]. +//! [`crate::drafter::Drafter::set_shared_kv`]. //! 4. **Termination.** Loop exits when an emitted token is in the //! target's `eos_token_ids()` or `emitted >= max_tokens`. diff --git a/src/lib/mlxcel-core/src/speculative/mtp/tests.rs b/src/lib/mlxcel-core/src/speculative/mtp/tests.rs index f000c420..1b38da49 100644 --- a/src/lib/mlxcel-core/src/speculative/mtp/tests.rs +++ b/src/lib/mlxcel-core/src/speculative/mtp/tests.rs @@ -579,12 +579,7 @@ fn greedy_parity_perfect_drafter_matches_no_drafter_baseline_32_tokens() { let scripted_target: Vec> = (0..11) .map(|r| { let base = 1000 + r * 4; - vec![ - base as i32, - (base + 1) as i32, - (base + 2) as i32, - (base + 3) as i32, - ] + vec![base, (base + 1), (base + 2), (base + 3)] }) .collect(); let scripted_draft: Vec> = scripted_target diff --git a/src/lib/mlxcel-core/src/utils.rs b/src/lib/mlxcel-core/src/utils.rs index 13207b56..450ce211 100644 --- a/src/lib/mlxcel-core/src/utils.rs +++ b/src/lib/mlxcel-core/src/utils.rs @@ -473,7 +473,7 @@ pub fn align_to_na_tile(len: usize) -> usize { if len == 0 { return 0; } - ((len + NA_TILE_SIZE - 1) / NA_TILE_SIZE) * NA_TILE_SIZE + len.div_ceil(NA_TILE_SIZE) * NA_TILE_SIZE } /// Create a causal attention mask for a tile-aligned padded prefill. @@ -510,8 +510,11 @@ pub fn create_padded_prefill_mask( // Shape: [1, total_kv] (broadcast along the query axis). // Value: 1 for valid key positions, 0 for padding key positions. let mut valid_mask_data = vec![0f32; total_kv as usize]; - for i in 0..(actual_len + offset) as usize { - valid_mask_data[i] = 1.0; + for v in valid_mask_data + .iter_mut() + .take((actual_len + offset) as usize) + { + *v = 1.0; } let valid_mask = ffi::from_slice_f32(&valid_mask_data, &[1, total_kv]); @@ -609,7 +612,7 @@ pub fn pipeline_hint(hidden: &MlxArray, layer_idx: usize, total_layers: usize) { ffi::async_eval(hidden); } PipelineMode::PerBlock(n) => { - if (layer_idx + 1) % n == 0 { + if (layer_idx + 1).is_multiple_of(*n) { ffi::async_eval(hidden); } } diff --git a/src/lib/mlxcel-surgery/src/ops/add_test_helpers.rs b/src/lib/mlxcel-surgery/src/ops/add_test_helpers.rs index c93e3ce2..99911ffa 100644 --- a/src/lib/mlxcel-surgery/src/ops/add_test_helpers.rs +++ b/src/lib/mlxcel-surgery/src/ops/add_test_helpers.rs @@ -22,8 +22,6 @@ //! //! Used by: `add_tests`, `add_apply_tests`. -#![cfg(test)] - use std::path::Path; use mlxcel_core::dtype as mlx_dtype; diff --git a/src/server/batch/speculative_burst.rs b/src/server/batch/speculative_burst.rs index e054a048..0aeca2f1 100644 --- a/src/server/batch/speculative_burst.rs +++ b/src/server/batch/speculative_burst.rs @@ -1698,7 +1698,7 @@ pub(crate) fn try_run_burst_batched( Ok(rows_tokens) => { debug_assert_eq!(rows_tokens.len(), batch_size); let mut finalized_rows = Vec::with_capacity(batch_size); - for (seq, tokens) in seqs.into_iter().zip(rows_tokens.into_iter()) { + for (seq, tokens) in seqs.into_iter().zip(rows_tokens) { let seq_id = seq.seq_id; // Transition Queued -> Prefilling now that the burst // owned the row's lifecycle.