// Source: mlxcel/src/lib/mlxcel-core/src/generate.rs at 46915c1ede458d98209841799e33253d06330563 · lablup/mlxcel · GitHub

// Copyright 2025-2026 Lablup Inc. and Jeongkyu Shin
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Token generation utilities for mlxcel-core models
//!
//! This module provides the generation loop and sampling functions
//! for text generation with mlxcel-core models.
//!
//! Key optimizations matching Python mlx-lm:
//! - Dedicated generation stream for pipelined execution
//! - Lookahead pipelining: compute token n+1 while returning token n
//! - Optimized decode loops for standard and embedding-prefill paths
//! - Shared sampling policy delegated to `crate::sampling`
//! - Shared decode setup delegated to `crate::generation_policy`

use std::borrow::Cow;

use crate::cache::{CachePool, KVCacheMode, SequenceId};
use crate::ffi;
use crate::ffi::{MlxArray, MlxThreadLocalStream};
use crate::generation_policy::{
    ensure_model_caches, initial_token_history, merged_eos_token_ids, seed_rng_if_needed,
};
use crate::hardware;
use crate::layers::KVCache;
use crate::sampling::{sample_token_optimized, TokenBiasMap};
use crate::streams::{install_thread_local_default_stream, new_thread_local_generation_stream};
use crate::utils::{align_to_na_tile, create_padded_prefill_mask};
use cxx::UniquePtr;

/// Reports whether prompt prefill should be padded to a tile-aligned length.
///
/// Alignment is enabled only when the detected hardware reports both
/// `has_neural_accelerator` and `macos_supports_na`.
///
/// Setting the `MLXCEL_NO_PADDED_PREFILL` environment variable (e.g. to `1`)
/// turns alignment off for debugging. Only the presence of a readable value
/// is checked, not its content, and the hardware is not queried in that case.
#[inline]
fn should_align_prefill() -> bool {
    match std::env::var("MLXCEL_NO_PADDED_PREFILL") {
        // Opt-out present: skip the hardware probe entirely.
        Ok(_) => false,
        Err(_) => {
            let hw_info = hardware::get_hardware();
            hw_info.has_neural_accelerator && hw_info.macos_supports_na
        }
    }
}

/// Reports whether an explicit array mask is forced for padded prefill.
///
/// Returns `true` whenever the `MLXCEL_FORCE_PADDED_PREFILL_MASK` environment
/// variable is set to a value `std::env::var` can read; the value itself is
/// not inspected.
#[inline]
fn force_padded_prefill_array_mask() -> bool {
    match std::env::var("MLXCEL_FORCE_PADDED_PREFILL_MASK") {
        Ok(_) => true,
        Err(_) => false,
    }
}

/// Pad a prompt token slice to `padded_len` with the pad token (0) and return
/// both the padded tokens and, when required, an attention mask.
///
/// * If `padded_len <= prompt_tokens.len()` nothing is padded: the original
///   tokens are returned unchanged together with `None` (the forward pass
///   will use its built-in causal mask). A target shorter than the prompt is
///   treated as "no padding" so the prompt is never truncated.
/// * Otherwise the sequence is extended with zeros. A padded causal mask is
///   returned so padding positions do not leak into the KV cache values,
///   unless the caller opted into the maskless path (see
///   `use_maskless_causal`).
///
/// # Arguments
/// * `prompt_tokens`       - Original token IDs.
/// * `padded_len`          - Target aligned length (expected to be
///   ≥ `prompt_tokens.len()`).
/// * `use_maskless_causal` - When `true`, no explicit mask is built for the
///   padded sequence, unless `MLXCEL_FORCE_PADDED_PREFILL_MASK` is set, which
///   forces the array mask regardless.
///
/// # Returns
/// `(tokens, mask_or_none)`. `mask_or_none` is `None` when no padding was
/// added or when the maskless path is in effect.
fn pad_tokens_for_prefill(
    prompt_tokens: &[i32],
    padded_len: usize,
    use_maskless_causal: bool,
) -> (Vec<i32>, Option<UniquePtr<MlxArray>>) {
    let actual_len = prompt_tokens.len();
    // `<=` rather than `==`: `Vec::resize` below would *truncate* the prompt
    // if the target were shorter than the input, silently dropping tokens.
    if padded_len <= actual_len {
        return (prompt_tokens.to_vec(), None);
    }

    let mut padded = Vec::with_capacity(padded_len);
    padded.extend_from_slice(prompt_tokens);
    padded.resize(padded_len, 0); // pad with token id 0

    // Maskless path: rely on the model's implicit causal attention unless the
    // debugging override demands an explicit array mask.
    if use_maskless_causal && !force_padded_prefill_array_mask() {
        return (padded, None);
    }

    let mask = create_padded_prefill_mask(actual_len as i32, padded_len as i32, 0);
    (padded, Some(mask))
}

/// After a padded prefill, trim all KV caches back to `actual_len` so that
/// the decode phase starts with the correct sequence position.
///
/// The padded token positions `[actual_len, padded_len)` were written to the
/// cache during the forward pass; trimming removes them so the KV cache offset
/// reflects only the real prompt tokens.
///
/// # Arguments
/// * `caches`     - Per-layer caches to trim in place.
/// * `actual_len` - Number of real prompt tokens.
/// * `padded_len` - Length the prompt was padded to for prefill.
///
/// When `padded_len <= actual_len` there is nothing to trim and the caches
/// are left untouched.
fn trim_caches_to_actual_len(caches: &mut [KVCache], actual_len: usize, padded_len: usize) {
    // Saturating subtraction: a plain `padded_len - actual_len` on `usize`
    // panics (debug) or wraps (release) when `actual_len > padded_len`, which
    // the `<= 0` guard below could never catch.
    let excess = padded_len.saturating_sub(actual_len) as i32;
    if excess <= 0 {
        return;
    }
    for cache in caches.iter_mut() {
        cache.trim(excess);
    }
}

/// Extend an embeddings tensor along its sequence axis with zero rows.
///
/// `embeds` is read as `[batch, actual_len, hidden]`; the result has shape
/// `[batch, padded_len, hidden]`. The appended rows are zeros of the same
/// dtype as `embeds`. When `padded_len` does not exceed the current sequence
/// length, a full-range slice of `embeds` is returned instead (no padding).
///
/// Used by the VLM tile-alignment path to match the padded token sequence.
fn pad_embeddings(embeds: &MlxArray, padded_len: usize) -> UniquePtr<MlxArray> {
    let dims = ffi::array_shape(embeds);
    let (batch, seq, hidden) = (dims[0], dims[1], dims[2]);
    let current_len = seq as usize;

    if padded_len > current_len {
        let missing_rows = (padded_len - current_len) as i32;
        let elem_dtype = ffi::array_dtype(embeds);
        let zero_rows = ffi::zeros(&[batch, missing_rows, hidden], elem_dtype);
        // Axis 1 is the sequence axis.
        crate::concatenate(embeds, &zero_rows, 1)
    } else {
        // Already long enough: hand back the whole tensor as a slice.
        ffi::slice(embeds, &[0, 0, 0], &[batch, current_len as i32, hidden])
    }
}

/// Pick out the logits for one sequence position, keeping the sequence axis.
///
/// `logits` is read as `[batch, seq_len, vocab]`; the result is the slice
/// `[batch, pos..pos + 1, vocab]`, i.e. shape `[batch, 1, vocab]`. Keeping
/// the size-1 sequence dimension lets the caller pass the result on to
/// `sample_token_optimized`, which internally calls `slice_last_logits` and
/// expects a `[batch, seq_len, vocab]` layout.
///
/// Used after a padded prefill to obtain the prediction for the last *real*
/// token position rather than the last padding position.
fn logits_at_position(logits: &MlxArray, pos: usize) -> UniquePtr<MlxArray> {
    let dims = ffi::array_shape(logits);
    let seq_start = pos as i32;
    let begin = [0, seq_start, 0];
    let end = [dims[0], seq_start + 1, dims[2]];
    ffi::slice(logits, &begin, &end)
}

/// Trait for language models that can be used for generation
pub trait LanguageModel {
    /// Forward pass through the model
    /// Returns logits of shape [batch, seq_len, vocab_size]
    fn forward(
        &self,
        input_ids: &MlxArray,
        caches: &mut [KVCache],
        mask: Option<&MlxArray>,
    ) -> UniquePtr<MlxArray>;

    /// Create KV caches for all layers
    fn make_caches(&self) -> Vec<KVCache>;

    /// Get the number of layers
    fn num_layers(&self) -> usize;

    /// Get the EOS token IDs for this model
    fn eos_token_ids(&self) -> Vec<i32>;

    /// Forward with pre-computed embeddings (for VLM prefill)
    /// Used by: VisionLanguageModel (Gemma3 VLM)
    fn forward_with_embeddings(
        &self,
        input_ids: &MlxArray,
        input_embeddings: Option<&MlxArray>,
        caches: &mut [KVCache],
        mask: Option<&MlxArray>,
    ) -> UniquePtr<MlxArray> {
        // Default: ignore embeddings, use standard forward
        let _ = input_embeddings;
        self.forward(input_ids, caches, mask)
    }

    /// Get embeddings for token IDs (needed by VisionModule for merging)
    /// Used by: VisionModule::get_input_embeddings
    fn embed_tokens(&self, _input_ids: &MlxArray) -> Option<UniquePtr<MlxArray>> {
        None // default: not supported
    }

    /// Hand out a shared-buffer handle to this model's input embedding
    /// table for speculative drafters that lazy-bind it.
    ///
    /// Unlike [`Self::embed_tokens`] (which applies the embedding to a
    /// given id tensor), this returns the embedding *module* itself so a
    /// drafter can use it both as an embedding lookup and as a tied LM
    /// head (`UnifiedEmbedding::as_linear`). The returned
    /// [`UnifiedEmbedding`] shares the underlying MLX buffers with the
    /// target (lazy-array share via `UnifiedEmbedding::clone_shared` — no
    /// element copy) and stays valid for the lifetime of the speculative
    /// session.
    ///
    /// The default returns `None`; only targets that can pair with a
    /// lazy-bind drafter override it. Concretely, the upstream
    /// `z-lab/Qwen3.5-4B-DFlash` checkpoint omits `embed_tokens.weight`
    /// and the Rust DFlash drafter resolves it here during
    /// [`crate::drafter::Drafter::bind`].
    ///
    /// Used by: DFlash drafter lazy-bind path; Gemma 4 MTP assistant
    /// binding; Qwen 3.5 target family; Gemma 4 target family
    fn embed_tokens_module(&self) -> Option<crate::layers::UnifiedEmbedding> {
        None // default: not supported
    }

    /// Hand out a shared-buffer handle to this model's output projection
    /// when the projection is untied from the input embedding table.
    ///
    /// Some DFlash checkpoints (for example `z-lab/Qwen3.5-27B-DFlash`)
    /// omit both `embed_tokens.weight` and `lm_head.weight`; upstream Python
    /// binds both modules from the target at runtime, falling back to
    /// `embed_tokens.as_linear` only when the target has no explicit head.
    /// The default returns `None` so tied-embedding models keep using the
    /// embedding table path.
    ///
    /// Used by: DFlash drafter lazy-bind path for untied Qwen 3.5 targets.
    fn lm_head_module(&self) -> Option<crate::layers::UnifiedLinear> {
        None // default: tied or unsupported
    }

    /// Called once after prefill completes and before decode starts.
    /// Used by models that need to adjust internal state between phases,
    /// e.g. Phi4MM unfuses vision LoRA so decode uses base weights.
    fn after_prefill(&self) {}

    /// Trim internal caches after padded prefill. Models with internal
    /// cache state (e.g. NemotronH) override this to trim their own caches
    /// so that padding positions do not corrupt subsequent decode steps.
    fn trim_internal_caches(&self, _excess: i32) {}

    /// Release any model-owned sequence state associated with the provided
    /// external cache slice before the scheduler drops that cache set.
    ///
    /// Used by: Qwen3.5 mixed-cache map cleanup, server batch scheduler
    fn release_sequence_state(&self, _caches: &mut [KVCache]) {}

    /// Prepare model-owned/runtime sequence state before the scheduler starts
    /// using this `SequenceId`.
    fn prepare_sequence_state(&self, _seq_id: SequenceId) {}

    /// Release model-owned/runtime sequence state by its scheduler `SequenceId`.
    fn release_sequence_state_by_id(&self, _seq_id: SequenceId) {}

    /// Describe how one sequence's runtime state should be allocated.
    ///
    /// Phase 0 keeps the default behavior aligned with today's
    /// `supports_batching()` split while giving the control plane an explicit
    /// backend/layout seam for future paged and model-owned sequence state.
    ///
    /// Used by: `CachePool::allocate()`
    fn sequence_state_layout(&self) -> crate::cache::SequenceStateLayout {
        let num_layers = self.num_layers();
        if self.supports_batching() {
            crate::cache::SequenceStateLayout::dense_kv_cache(num_layers)
        } else {
            crate::cache::SequenceStateLayout::model_owned(num_layers)
        }
    }

    /// Whether this model supports tile-aligned padded prefill on M5+ hardware.
    ///
    /// Pure transformer models return `true` (the default) because padding
    /// tokens only affect the external KV cache which is trimmed afterwards.
    /// Hybrid SSM models (NemotronH, Jamba, Mamba, etc.) return `false`
    /// because padding tokens corrupt the internal recurrent state (conv /
    /// SSM state) in a way that cannot be safely trimmed, and the resulting
    /// NaN/inf values can corrupt the Metal GPU state.
    fn supports_padded_prefill(&self) -> bool {
        true
    }

    /// Whether tile-aligned padded prefill can safely use the model's implicit
    /// causal attention path without building an explicit array mask.
    ///
    /// This is only valid for standard causal transformer prefill where:
    /// - padding tokens are appended after the real prompt
    /// - outputs from padded positions are discarded
    /// - external/internal caches are trimmed back to the real prompt length
    ///
    /// Hybrid/recurrent models and models with custom prefill mask semantics
    /// should keep returning `false`.
    fn supports_maskless_padded_prefill(&self) -> bool {
        false
    }

    /// Whether this model supports batched decode for continuous batching.
    ///
    /// Standard transformer models return `true` (the default) because their
    /// state lives entirely in the external `KVCache` slice. SSM and hybrid
    /// models (Mamba, Jamba, NemotronH, etc.) maintain internal recurrent
    /// state that is not compatible with independent per-sequence cache
    /// isolation, so they override this to return `false`.
    ///
    /// Used by: CachePool (to reject unsupported models), server scheduler
    fn supports_batching(&self) -> bool {
        true
    }

    /// Whether the server batch scheduler may use the paged decode backend
    /// for this model family.
    ///
    /// This is stricter than `supports_batching()`: a model can participate in
    /// batched decode while still opting out of paged decode until its
    /// attention path, cache semantics, and operational validation are ready.
    fn supports_paged_decode_backend(&self) -> bool {
        false
    }

    /// Whether this model supports full-sequence batched prefill.
    ///
    /// This is stricter than decode batching. A model may support
    /// `forward_batched()` for `[B, 1]` decode while not supporting
    /// `[B, T]` prompt prefill with shared graph execution.
    ///
    /// The default is `false` so server prefill keeps using the standard
    /// single-sequence path unless a model explicitly opts in with a
    /// true full-prompt batched implementation.
    ///
    /// Used by: BatchScheduler batched prefill gate
    fn supports_batched_prefill(&self) -> bool {
        false
    }

    /// Single-sequence forward with optional scheduler sequence identity.
    fn forward_with_sequence_id(
        &self,
        input_ids: &MlxArray,
        seq_id: Option<SequenceId>,
        caches: &mut [KVCache],
        mask: Option<&MlxArray>,
    ) -> UniquePtr<MlxArray> {
        let _ = seq_id;
        self.forward(input_ids, caches, mask)
    }

    /// Embedding-prefill forward with optional scheduler sequence identity.
    fn forward_with_embeddings_and_sequence_id(
        &self,
        input_ids: &MlxArray,
        input_embeddings: Option<&MlxArray>,
        seq_id: Option<SequenceId>,
        caches: &mut [KVCache],
        mask: Option<&MlxArray>,
    ) -> UniquePtr<MlxArray> {
        let _ = seq_id;
        self.forward_with_embeddings(input_ids, input_embeddings, caches, mask)
    }

    /// Synchronize model-owned sequence storage into the runtime backend state.
    fn sync_sequence_storage(
        &self,
        seq_id: SequenceId,
        cache_pool: &mut CachePool,
    ) -> Result<(), String> {
        cache_pool.sync_paged_state_with_dense(seq_id)
    }

    /// Batched decode with explicit runtime context from the scheduler.
    ///
    /// This extends `forward_batched()` without forcing all model families to
    /// plumb scheduler-specific state through their existing dense path. The
    /// default implementation ignores the context and delegates to
    /// `forward_batched()`.
    ///
    /// Used by: BatchScheduler decode backend dispatch, paged decode profiling
    fn forward_batched_with_context(
        &self,
        input_ids: &MlxArray,
        batch_caches: &mut [&mut [KVCache]],
        mask: Option<&MlxArray>,
        context: Option<&DecodeBatchContext>,
    ) -> UniquePtr<MlxArray> {
        let _ = context;
        self.forward_batched(input_ids, batch_caches, mask)
    }

    /// Batched forward with optional scheduler sequence identities.
    fn forward_batched_with_context_and_ids(
        &self,
        input_ids: &MlxArray,
        seq_ids: Option<&[SequenceId]>,
        batch_caches: &mut [&mut [KVCache]],
        mask: Option<&MlxArray>,
        context: Option<&DecodeBatchContext>,
    ) -> UniquePtr<MlxArray> {
        let _ = seq_ids;
        self.forward_batched_with_context(input_ids, batch_caches, mask, context)
    }

    /// Batched decode: process B sequences in one forward pass.
    ///
    /// `input_ids` has shape `[B, 1]` where B is the batch size (one new
    /// token per active sequence). `batch_caches[i]` is the per-layer KV
    /// cache slice for the i-th sequence.
    ///
    /// Returns logits of shape `[B, 1, vocab_size]`.
    ///
    /// The default implementation falls back to a loop that calls
    /// `forward()` once per sequence and stacks the results. Models that
    /// override this (e.g. Llama3) batch the compute-bound layers
    /// (embedding, norm, FFN) and only run attention per-sequence, which
    /// amortizes weight-loading bandwidth across the batch.
    ///
    /// Used by: BatchScheduler (server continuous batching)
    #[allow(clippy::needless_range_loop)]
    fn forward_batched(
        &self,
        input_ids: &MlxArray,
        batch_caches: &mut [&mut [KVCache]],
        _mask: Option<&MlxArray>,
    ) -> UniquePtr<MlxArray> {
        let b = batch_caches.len();
        if b == 0 {
            return ffi::zeros(&[0, 1, 1], crate::dtype::FLOAT32);
        }
        if b == 1 {
            // Fast path: single sequence, no slicing/stacking overhead
            let logits = self.forward(input_ids, batch_caches[0], None);
            return logits;
        }

        // Default fallback: loop over batch dimension, calling forward()
        // once per sequence and concatenating the results into [B, 1, vocab].
        // Each forward() returns [1, 1, vocab]; concatenate along axis 0
        // yields [B, 1, vocab].
        let token_0 = ffi::slice(input_ids, &[0, 0], &[1, 1]);
        let mut result = self.forward(&token_0, batch_caches[0], None);
        for i in 1..b {
            let token_i = ffi::slice(input_ids, &[i as i32, 0], &[i as i32 + 1, 1]);
            let logits_i = self.forward(&token_i, batch_caches[i], None);
            result = crate::concatenate(&result, &logits_i, 0);
        }
        result
    }
}

/// Storage backend hint the runtime supplies for decode.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DecodeStorageBackend {
    /// Dense storage.
    Dense,
    /// Paged storage.
    Paged,
}

/// Optional scheduler/runtime context handed to batched decode dispatch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct DecodeBatchContext {
    /// Which storage backend the runtime selected.
    pub storage_backend: DecodeStorageBackend,
    /// Block size for paged storage; `dense()` sets it to 0.
    pub paged_block_size: i32,
    /// Whether the native paged kernel is requested.
    pub use_native_paged_kernel: bool,
}

impl DecodeBatchContext {
    /// Context for the dense backend: block size 0, native paged kernel off.
    pub fn dense() -> Self {
        let storage_backend = DecodeStorageBackend::Dense;
        Self {
            storage_backend,
            paged_block_size: 0,
            use_native_paged_kernel: false,
        }
    }

    /// Context for the paged backend with the native paged kernel enabled.
    pub fn paged(block_size: i32) -> Self {
        let native_kernel = true;
        Self::paged_with_native(block_size, native_kernel)
    }

    /// Context for the paged backend with an explicit native-kernel choice.
    pub fn paged_with_native(block_size: i32, use_native_paged_kernel: bool) -> Self {
        Self {
            use_native_paged_kernel,
            paged_block_size: block_size,
            storage_backend: DecodeStorageBackend::Paged,
        }
    }

    /// `true` only for the paged backend with a strictly positive block size.
    pub fn is_paged_decode(self) -> bool {
        match self.storage_backend {
            DecodeStorageBackend::Paged => self.paged_block_size > 0,
            DecodeStorageBackend::Dense => false,
        }
    }
}

/// Sampling configuration for token generation.
///
/// `SamplingConfig::default()` yields a neutral setup (temperature 1.0, every
/// filter and penalty disabled); `SamplingConfig::greedy()` yields temperature
/// 0.0 with `top_k = 1`.
#[derive(Debug, Clone)]
pub struct SamplingConfig {
    /// Temperature for sampling (1.0 = no change)
    pub temperature: f32,
    /// Top-k sampling (0 = disabled)
    pub top_k: i32,
    /// Top-p (nucleus) sampling (1.0 = disabled)
    pub top_p: f32,
    /// Min-p sampling threshold (0.0 = disabled)
    /// Removes tokens with probability < min_p * max_probability
    pub min_p: f32,
    /// Random seed for reproducibility (None = random)
    pub seed: Option<u64>,
    /// Repetition penalty (1.0 = disabled)
    pub repetition_penalty: f32,
    /// DRY multiplier (0.0 = disabled)
    pub dry_multiplier: f32,
    /// DRY exponential base (default: 1.75)
    pub dry_base: f32,
    /// DRY minimum match length before penalty applies (default: 2)
    pub dry_allowed_length: usize,
    /// DRY lookback window (0 = all history)
    pub dry_penalty_last_n: usize,
    /// Token IDs that break DRY matching (e.g., newlines, punctuation)
    pub dry_sequence_breakers: Vec<i32>,
    /// OpenAI-style frequency penalty: subtract penalty * count(token) from logits (0.0 = disabled)
    pub frequency_penalty: f32,
    /// OpenAI-style presence penalty: subtract penalty if token appeared at all (0.0 = disabled)
    pub presence_penalty: f32,
    /// Additional stop token IDs (from generation_config.json or API request)
    /// Merged with model's built-in eos_token_ids during generation
    pub stop_token_ids: Vec<i32>,
    /// Per-token additive logit bias applied before all history-based penalties.
    /// Empty (default) is a zero-overhead no-op that preserves bit-exact baseline.
    pub token_bias: TokenBiasMap,
}

impl Default for SamplingConfig {
    /// Neutral configuration: temperature 1.0, no seed, and every filter,
    /// penalty, stop list and bias left at its "disabled" value.
    fn default() -> Self {
        Self {
            // Distribution shaping: all filters off.
            temperature: 1.0,
            top_k: 0,
            top_p: 1.0,
            min_p: 0.0,
            seed: None,
            // History-based penalties: all disabled.
            repetition_penalty: 1.0,
            frequency_penalty: 0.0,
            presence_penalty: 0.0,
            // DRY: disabled via the zero multiplier; tuning knobs keep their
            // documented defaults.
            dry_multiplier: 0.0,
            dry_base: 1.75,
            dry_allowed_length: 2,
            dry_penalty_last_n: 0,
            dry_sequence_breakers: Vec::default(),
            // No extra stop tokens and no logit bias.
            stop_token_ids: Vec::default(),
            token_bias: TokenBiasMap::default(),
        }
    }
}

impl SamplingConfig {
    /// Create greedy sampling config (temperature 0)
    pub fn greedy() -> Self {
        Self {
            temperature: 0.0,
            top_k: 1,
            top_p: 1.0,
            min_p: 0.0,
            seed: None,
            repetition_penalty: 1.0,
            dry_multiplier: 0.0,
            dry_base: 1.75,
            dry_allowed_length: 2,
            dry_penalty_last_n: 0,
            dry_sequence_breakers: Vec::new(),
            frequency_penalty: 0.0,
            presence_penalty: 0.0,
            stop_token_ids: Vec::new(),
            token_bias: TokenBiasMap::default(),
        }
    }

    /// Create config with specific temperature
    pub fn with_temperature(temp: f32) -> Self {
        Self {
            temperature: temp,
            ..Default::default()
        }
    }

    /// Check if any penalty-based sampling is enabled
    pub fn needs_token_history(&self) -> bool {
        self.repetition_penalty != 1.0
            || self.dry_multiplier > 0.0
            || self.frequency_penalty != 0.0
            || self.presence_penalty != 0.0
    }
}

/// Generation statistics: token counts plus timing and throughput for the
/// prefill and decode phases.
///
/// The derived `Default` zeroes every field.
#[derive(Debug, Clone, Default)]
pub struct GenerationStats {
    /// Number of prompt tokens processed
    pub prompt_tokens: usize,
    /// Number of tokens generated
    pub generated_tokens: usize,
    /// Time to process the prompt (prefill) in milliseconds
    pub prefill_time_ms: f64,
    /// Time to generate tokens (decode) in milliseconds
    pub decode_time_ms: f64,
    /// Prefill throughput: prompt tokens per second
    pub prefill_tok_per_sec: f64,
    /// Decode throughput: generated tokens per second
    pub decode_tok_per_sec: f64,
}

impl GenerationStats {
    /// Write the statistics to stdout as four indented lines: prompt token
    /// count, generated token count, then prefill and decode time (ms) with
    /// their throughput (tok/s), each to two decimal places.
    pub fn print(&self) {
        let Self {
            prompt_tokens,
            generated_tokens,
            prefill_time_ms,
            decode_time_ms,
            prefill_tok_per_sec,
            decode_tok_per_sec,
        } = self;

        println!("  Prompt tokens:    {}", prompt_tokens);
        println!("  Generated tokens: {}", generated_tokens);
        println!(
            "  Prefill:          {:.2} ms ({:.2} tok/s)",
            prefill_time_ms, prefill_tok_per_sec
        );
        println!(
            "  Decode:           {:.2} ms ({:.2} tok/s)",
            decode_time_ms, decode_tok_per_sec
        );
    }
}

/// Generator state for managing generation: the per-layer KV caches, the
/// generation stream handle, the KV cache mode and the cached token bias.
pub struct CxxGenerator {
    /// KV caches; the constructors create one entry per model layer.
    caches: Vec<KVCache>,
    /// Starts empty in the constructors.
    /// NOTE(review): presumably accumulates the token IDs produced so far —
    /// confirm against the `generate_*` methods.
    generated_tokens: Vec<i32>,
    /// Dedicated thread-local generation stream for pipelining.
    ///
    /// The TLS handle resolves to a per-thread `MlxStream` on demand
    /// when the generator first dispatches work on a worker thread.
    /// This keeps dispatch and synchronization paired even if the
    /// generator is constructed on one thread (e.g. the request
    /// dispatcher) and run on another (e.g. the model worker), per
    /// upstream `mlx-vlm` PR #1050 / mlxcel issue #556.
    generation_stream: Option<UniquePtr<MlxThreadLocalStream>>,
    /// KV cache quantization mode applied to all layer caches.
    /// Default: `KVCacheMode::Fp16` (no quantization).
    kv_cache_mode: KVCacheMode,
    /// Cached per-generator `TokenBiasMap` resolved from a `LangBiasConfig`.
    ///
    /// Populated at construction time via [`Self::with_token_bias`] (or a
    /// `LangBiasConfig`-aware constructor) and re-applied to every
    /// `SamplingConfig` used by the generator's public `generate_*` entry
    /// points. Empty (`TokenBiasMap::default`) is a zero-overhead no-op that
    /// preserves bit-exact baseline behavior, so callers that do not opt into
    /// language steering pay no per-token or per-call cost (see
    /// [`Self::compose_sampling`]).
    ///
    /// Axis B / Epic #362: populated by B8 wiring for the CLI `generate`
    /// path; the server batch scheduler caches its own copy on `BatchScheduler`.
    token_bias: TokenBiasMap,
}

impl CxxGenerator {
    /// Create a new generator with FP16 KV cache (default).
    pub fn new(num_layers: usize) -> Self {
        Self {
            caches: (0..num_layers).map(|_| KVCache::new()).collect(),
            generated_tokens: Vec::new(),
            generation_stream: new_thread_local_generation_stream(),
            kv_cache_mode: KVCacheMode::Fp16,
            token_bias: TokenBiasMap::default(),
        }
    }

    /// Create a new generator with the specified KV cache quantization mode.
    ///
    /// Use `KVCacheMode::Int8` to halve KV cache memory at the cost of
    /// small per-token quantization error.
    ///
    /// When `kv_cache_mode` is one of the `Turbo4*` variants, the
    /// **Boundary-V** policy (B6, issue #478, epic #458) is applied: the
    /// first / last N transformer layers' caches are upgraded to
    /// `KVCacheMode::Fp16` to recover the per-layer V-quantization quality
    /// gap measured in `references/turboquant_plus/docs/papers/
    /// layer-aware-v-compression.md`. The boundary count is read from the
    /// `MLXCEL_KV_BOUNDARY_V_LAYERS` env var (default 2; `0` disables) and
    /// clamped to `n_layers / 2`. For non-Turbo4 modes the policy is inert
    /// — every layer's cache uses `kv_cache_mode` unchanged.
    pub fn new_with_kv_mode(num_layers: usize, kv_cache_mode: KVCacheMode) -> Self {
        let requested = crate::cache::turbo::boundary_v_layers_from_env();
        let layer_modes =
            crate::cache::turbo::resolve_layer_modes(kv_cache_mode, num_layers, requested);
        Self {
            caches: layer_modes
                .into_iter()
                .map(KVCache::new_with_mode)
                .collect(),
            generated_tokens: Vec::new(),
            generation_stream: new_thread_local_generation_stream(),
            kv_cache_mode,
            token_bias: TokenBiasMap::default(),
        }
    }

    /// Builder-style setter that stores a pre-resolved `TokenBiasMap` on the
    /// generator and hands the generator back.
    ///
    /// The stored bias lives as long as the generator and is merged into
    /// every `SamplingConfig` passed to `generate_*`, except when the caller
    /// already supplied a non-empty `token_bias` on that sampling config
    /// (caller wins).
    ///
    /// To derive the map from a [`crate::LangBiasConfig`], callers typically
    /// pair this with [`crate::LangBiasConfig::resolve_token_bias`]:
    ///
    /// ```ignore
    /// let bias = lang_bias_config.resolve_token_bias(tokenizer, bytes)?;
    /// let generator = CxxGenerator::new(layers).with_token_bias(bias);
    /// ```
    ///
    /// An empty `bias` leaves the sampling path untouched: the composed
    /// `SamplingConfig` is returned by reference and `sample_*` goes through
    /// the existing zero-overhead branch.
    pub fn with_token_bias(self, bias: TokenBiasMap) -> Self {
        let mut generator = self;
        // Overwrites (and drops) whatever bias map was stored before.
        generator.token_bias = bias;
        generator
    }

    /// Returns a reference to the cached token-bias map.
    ///
    /// Used by tests to assert that B8 wiring populated the correct map.
    pub fn token_bias(&self) -> &TokenBiasMap {
        &self.token_bias
    }

    /// Merge the generator's cached `token_bias` with the caller's
    /// [`SamplingConfig`] to obtain the config that is actually sampled with.
    ///
    /// # Precedence and bit-exact baseline
    /// - An empty cached `token_bias` means the caller's config is borrowed
    ///   unchanged. This is the **baseline no-op path** and is bit-exact
    ///   identical to pre-B8 behavior.
    /// - A non-empty `sampling.token_bias` supplied by the caller wins; the
    ///   caller's config is borrowed as-is (zero allocation).
    /// - Only when the cached bias is non-empty and the caller's is empty is
    ///   the config cloned and the cached bias injected into the clone.
    ///
    /// Used by: `generate`, `generate_streaming`, `generate_with_stats`, and
    /// VLM embedding-aware variants so every generation path observes the
    /// cached bias without duplicating the merge logic.
    fn compose_sampling<'a>(&self, sampling: &'a SamplingConfig) -> Cow<'a, SamplingConfig> {
        let inject_cached_bias = !self.token_bias.is_empty() && sampling.token_bias.is_empty();
        if !inject_cached_bias {
            return Cow::Borrowed(sampling);
        }

        let mut merged = sampling.clone();
        merged.token_bias = self.token_bias.clone();
        Cow::Owned(merged)
    }

    /// Get every cache ready for decode under Turbo4Delegated.
    ///
    /// From the cache's point of view `max_tokens <= 1` is a prefill-only
    /// generation, so in that case the handoff fold is skipped and prefill
    /// probes stay isolated.
    ///
    /// Used by: streaming and stats generation paths immediately after the
    /// first sampled token is materialized or scheduled.
    fn prepare_turbo4_delegated_before_decode(&mut self, max_tokens: usize) {
        let has_decode_phase = max_tokens > 1;
        if has_decode_phase {
            self.caches.iter_mut().for_each(|layer_cache| {
                layer_cache.prepare_turbo4_delegated_for_decode();
            });
        }
    }

    /// Reset the generator: rebuild every cache and empty the token buffer.
    ///
    /// When the model uses internal caches (e.g. Gemma3, Jamba, Mamba,
    /// NemotronH, etc.) `reset_with_model` must be called instead so those
    /// are reset as well.
    ///
    /// The per-layer Boundary-V mode mapping (issue #478, epic #458) computed
    /// at construction time is kept: each cache is recreated with the
    /// `KVCacheMode` it already had (which may differ from
    /// `self.kv_cache_mode` for boundary layers), so quality protection
    /// survives a reset.
    pub fn reset(&mut self) {
        self.caches.iter_mut().for_each(|slot| {
            // Read the resolved mode before the slot is overwritten; reusing
            // it keeps boundary layers upgraded instead of collapsing them
            // back to a uniform Turbo4 setup.
            let preserved_mode = slot.mode;
            *slot = KVCache::new_with_mode(preserved_mode);
        });
        self.generated_tokens.clear();
    }

    /// Reset the generator together with the model's internal caches.
    ///
    /// Models with internal RefCell caches (sliding window, SSM, hybrid)
    /// reset their own state inside `make_caches()`, so calling it here
    /// clears both the generator's cache vector and the model's internal
    /// caches. The configured `kv_cache_mode` is then applied to the freshly
    /// created caches.
    ///
    /// Honors the Boundary-V policy (issue #478): when `self.kv_cache_mode`
    /// is one of the `Turbo4*` variants, the first / last N caches are
    /// re-resolved to `KVCacheMode::Fp16` instead of the nominal mode. N is
    /// read from `MLXCEL_KV_BOUNDARY_V_LAYERS`, so a runtime-tuned count is
    /// honored on every reset.
    pub fn reset_with_model<M>(&mut self, model: &M)
    where
        M: LanguageModel + ?Sized,
    {
        let fresh_caches = model.make_caches();
        self.caches = fresh_caches;
        // Stamp the configured mode (including the Boundary-V upgrade) onto
        // the caches that were just created.
        self.apply_kv_cache_mode_with_boundary_policy();
        self.generated_tokens.clear();
    }

    /// Mutable view over the generator's caches (used by speculative decoding).
    pub fn caches_mut(&mut self) -> &mut [KVCache] {
        self.caches.as_mut_slice()
    }

    /// Write the configured KV cache mode, adjusted by the Boundary-V policy,
    /// onto every cache slot.
    ///
    /// Generation entry points call this right after `ensure_model_caches`
    /// rebuilds caches from `model.make_caches()` (which always uses the
    /// default Fp16 mode). Keeping the per-layer mode resolution in one place
    /// lets the Boundary-V policy (issue #478) survive the entire generation
    /// lifecycle, including `reset_with_model` boundary cases.
    ///
    /// Does nothing when `self.kv_cache_mode == Fp16`: every layer is already
    /// FP16, so there is nothing to apply.
    fn apply_kv_cache_mode_with_boundary_policy(&mut self) {
        if self.kv_cache_mode != KVCacheMode::Fp16 {
            let layer_count = self.caches.len();
            let boundary_layers = crate::cache::turbo::boundary_v_layers_from_env();
            let per_layer_modes = crate::cache::turbo::resolve_layer_modes(
                self.kv_cache_mode,
                layer_count,
                boundary_layers,
            );
            // Pair each cache with its resolved mode, in layer order.
            self.caches
                .iter_mut()
                .zip(per_layer_modes)
                .for_each(|(slot, resolved_mode)| {
                    slot.mode = resolved_mode;
                });
        }
    }

    /// Generate tokens from the model (original implementation).
    ///
    /// Delegates to `generate_streaming` with a callback that always returns
    /// `true`, i.e. one that never asks for an early abort, and returns
    /// whatever that call returns.
    pub fn generate<M: LanguageModel>(
        &mut self,
        model: &M,
        prompt_tokens: &[i32],
        max_tokens: usize,
        sampling: &SamplingConfig,
    ) -> Vec<i32> {
        // Per-token callback that unconditionally continues.
        let keep_going = |_token: i32| true;
        self.generate_streaming(model, prompt_tokens, max_tokens, sampling, keep_going)
    }

    /// Streaming generation with per-token callback and lookahead pipelining.
    ///
    /// The callback receives each generated token ID and returns `true` to continue
    /// or `false` to abort early. Pipelining is preserved: next step computation
    /// starts before the current token is returned.
    ///
    /// Used by: CxxGenerator::generate, ModelProvider (server streaming)
    pub fn generate_streaming<M: LanguageModel, F: FnMut(i32) -> bool>(
        &mut self,
        model: &M,
        prompt_tokens: &[i32],
        max_tokens: usize,
        sampling: &SamplingConfig,
        mut on_token: F,
    ) -> Vec<i32> {
        // Reset state
        self.reset();

        // Axis B: merge any generator-cached language-bias map into the
        // sampling config before seeding/penalty evaluation. Empty cached
        // bias => borrowed unchanged (bit-exact baseline; zero alloc).
        let sampling_cow = self.compose_sampling(sampling);
        let sampling = sampling_cow.as_ref();

        // Set random seed if specified (for reproducibility)
        seed_rng_if_needed(sampling);

        // Ensure caches are initialized for this model.
        // `ensure_model_caches` may rebuild caches from `model.make_caches()`
        // (which always uses the default Fp16 mode), so re-apply kv_cache_mode
        // afterwards when a non-default mode is configured.
        ensure_model_caches(&mut self.caches, model);
        // Honor the Boundary-V policy (issue #478) when applying the
        // nominal mode to per-layer caches: the first/last N layers stay
        // at FP16 to recover the V-quantization quality gap.
        self.apply_kv_cache_mode_with_boundary_policy();

        // Set generation stream as default for better pipelining
        install_thread_local_default_stream(self.generation_stream.as_ref());

        // Get EOS tokens for this model
        let eos_tokens = merged_eos_token_ids(model.eos_token_ids(), &sampling.stop_token_ids);

        // Hoist env var checks out of the hot loop to avoid per-token syscalls.
        let trace_dtype = std::env::var("MLXCEL_TRACE_DTYPE").is_ok();
        let force_sync = std::env::var("MLXCEL_FORCE_SYNC").is_ok();
        let profile_pipeline = std::env::var("MLXCEL_PROFILE_PIPELINE").is_ok();
        let profile_pipeline_detail = std::env::var("MLXCEL_PROFILE_PIPELINE_DETAIL").is_ok();

        // Prefill: process all prompt tokens at once.
        // On M5+ hardware pad the sequence to a 32-token tile boundary for
        // optimal Neural Accelerator throughput.
        let actual_len = prompt_tokens.len();
        let logits = if should_align_prefill() && model.supports_padded_prefill() {
            let padded_len = align_to_na_tile(actual_len);
            let (padded_tokens, mask_opt) = pad_tokens_for_prefill(
                prompt_tokens,
                padded_len,
                model.supports_maskless_padded_prefill(),
            );
            let input = ffi::from_slice_i32(&padded_tokens, &[1, padded_len as i32]);
            let raw_logits = model.forward(
                &input,
                &mut self.caches,
                mask_opt.as_ref().map(|m| m.as_ref().unwrap()),
            );
            // Trim padding positions from all KV caches so decode uses the
            // correct cache offset (actual_len, not padded_len).
            if padded_len > actual_len {
                trim_caches_to_actual_len(&mut self.caches, actual_len, padded_len);
                model.trim_internal_caches((padded_len - actual_len) as i32);
                // Extract logits at the last real token position.
                logits_at_position(&raw_logits, actual_len - 1)
            } else {
                // No padding was needed (already aligned).
                raw_logits
            }
        } else {
            let input = ffi::from_slice_i32(prompt_tokens, &[1, actual_len as i32]);
            model.forward(&input, &mut self.caches, None)
        };

        if trace_dtype {
            ffi::eval(&logits);
            let shape = ffi::array_shape(&logits);
            eprintln!(
                "[LOGITS] prefill dtype={} shape={:?}",
                ffi::array_dtype(&logits),
                shape
            );
        }

        // Clear intermediate tensors from prefill to free memory
        ffi::clear_memory_cache();

        // Build token history from prompt for penalty-based sampling
        let needs_history = sampling.needs_token_history();
        let mut token_history = initial_token_history(prompt_tokens, needs_history);

        // Sample first token (logits already sliced to last real position when padded)
        let (mut y, mut _logprobs) = sample_token_optimized(&logits, sampling, &token_history);
        ffi::async_eval(&y);
        self.prepare_turbo4_delegated_before_decode(max_tokens);

        // Main generation loop - matches Python exactly:
        // 1. Start next step computation
        // 2. async_eval next step
        // 3. Extract current value (syncs current only)
        // 4. Yield/store current
        // 5. Move next to current
        let mut build_ns_total = 0u128;
        let mut wait_ns_total = 0u128;
        let mut reshape_ns_total = 0u128;
        let mut forward_ns_total = 0u128;
        let mut sample_ns_total = 0u128;
        let mut async_eval_ns_total = 0u128;
        let mut profile_count = 0u32;

        let mut n = 0;
        loop {
            // Start next step (if not at max)
            let build_start = if profile_pipeline {
                Some(std::time::Instant::now())
            } else {
                None
            };

            let (next_y, next_logprobs) = if n + 1 < max_tokens {
                let detail_start = if profile_pipeline_detail {
                    Some(std::time::Instant::now())
                } else {
                    None
                };
                let next_input = ffi::reshape_token_for_forward(&y);
                if let Some(start) = detail_start {
                    reshape_ns_total += start.elapsed().as_nanos();
                }
                let detail_start = if profile_pipeline_detail {
                    Some(std::time::Instant::now())
                } else {
                    None
                };
                let next_logits = model.forward(&next_input, &mut self.caches, None);
                if let Some(start) = detail_start {
                    forward_ns_total += start.elapsed().as_nanos();
                }
                if trace_dtype && n == 0 {
                    ffi::eval(&next_logits);
                    eprintln!("[LOGITS] decode dtype={}", ffi::array_dtype(&next_logits));
                }
                let detail_start = if profile_pipeline_detail {
                    Some(std::time::Instant::now())
                } else {
                    None
                };
                let (next_tok, next_log) =
                    sample_token_optimized(&next_logits, sampling, &token_history);
                if let Some(start) = detail_start {
                    sample_ns_total += start.elapsed().as_nanos();