mlxcel/src/lib/mlxcel-core/cpp/mlx_cxx_bridge.h at 74850527a2c58a644b353f126df182e3a049976f · lablup/mlxcel · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2025 mlx-lm-rs authors
// Direct C++ bridge for MLX via cxx

#pragma once

#include <memory>
#include <cstdint>
#include "rust/cxx.h"
#include "mlx/mlx.h"

namespace mlx_cxx {

// Thin owning box around a single `mlx::core::array`.
// cxx treats this struct as an opaque type, so the array's internals stay on
// the C++ side while the wrapper's lifetime is managed through cxx.
struct MlxArray {
    // Copy path: copy-constructs `inner` from `source`.
    explicit MlxArray(const mlx::core::array& source) : inner(source) {}

    // Move path: move-constructs `inner` from `source`.
    explicit MlxArray(mlx::core::array&& source) : inner(std::move(source)) {}

    // The wrapped MLX array. Public, so code holding the wrapper can reach it.
    mlx::core::array inner;
};

// Opaque wrapper for mlx::core::Stream
struct MlxStream {
    mlx::core::Stream inner;

    explicit MlxStream(mlx::core::Stream s) : inner(s) {}
};

// Opaque wrapper for mlx::core::ThreadLocalStream.
//
// A `ThreadLocalStream` is a stream-like handle whose physical
// `mlx::core::Stream` is resolved per-thread on demand via
// `mlx::core::stream_from_thread_local_stream`. Holding the same handle
// across threads gives every thread its own dedicated MLX stream
// without any explicit coordination between them. Used by
// `mlxcel-core` to back the generation stream of `BatchScheduler`,
// `CxxGenerator`, and `SpeculativeGenerator` (issue #556 / upstream
// MLX commit `728fab1` in mlx-vlm PR #1050).
struct MlxThreadLocalStream {
    mlx::core::ThreadLocalStream inner;

    explicit MlxThreadLocalStream(mlx::core::ThreadLocalStream s) : inner(s) {}
};


// Stream functions.
// The two factories below return a heap-allocated `MlxStream` wrapper that the
// caller owns through `std::unique_ptr`.
//
// Returns a wrapper for MLX's default stream.
// NOTE(review): which device the default stream belongs to is decided in the
// implementation, which is not visible in this header — confirm.
std::unique_ptr<MlxStream> default_stream();
// Creates a new stream on the device selected by `gpu`.
// NOTE(review): presumably `true` selects the GPU and `false` the CPU — confirm.
std::unique_ptr<MlxStream> new_stream_on_device(bool gpu);
// Synchronizes `stream`.
// NOTE(review): presumably blocks the caller until work queued on `stream`
// has finished — confirm against the implementation.
void synchronize_stream(const MlxStream& stream);

// Thread-local stream factory bound to the GPU device.
//
// The returned handle is safe to share across threads: each calling
// thread sees its own per-thread MLX stream when it calls
// `stream_from_thread_local_stream`. Used by the generation stream
// owners so that decoding and synchronization always happen on the
// same per-thread stream, even if the owner is later moved between
// threads.
std::unique_ptr<MlxThreadLocalStream> new_thread_local_stream_gpu();

// Resolve the calling thread's `MlxStream` from a thread-local handle.
//
// Each calling thread receives its own `mlx::core::Stream` for the
// device the handle was created on. The same handle returns the same
// per-thread stream across calls on that thread.
std::unique_ptr<MlxStream> stream_from_thread_local_stream(const MlxThreadLocalStream& tls);

// Synchronize the calling thread's stream associated with this handle.
//
// Equivalent to resolving the handle and calling `synchronize_stream`,
// but goes through MLX's `synchronize(ThreadLocalStream)` overload so
// that synchronization is bound to the same per-thread stream that
// dispatched the work.
void synchronize_thread_local_stream(const MlxThreadLocalStream& tls);

// Array factory functions.
// Create array filled with zeros
std::unique_ptr<MlxArray> zeros(rust::Slice<const int32_t> shape, int32_t dtype);
std::unique_ptr<MlxArray> zeros_stream(rust::Slice<const int32_t> shape, int32_t dtype, const MlxStream& stream);

// Create array filled with ones
std::unique_ptr<MlxArray> ones(rust::Slice<const int32_t> shape, int32_t dtype);
std::unique_ptr<MlxArray> ones_stream(rust::Slice<const int32_t> shape, int32_t dtype, const MlxStream& stream);

// Create array with specific value
std::unique_ptr<MlxArray> full_f32(rust::Slice<const int32_t> shape, float value, int32_t dtype);

// Create identity/eye matrix
std::unique_ptr<MlxArray> eye(int32_t n, int32_t m, int32_t k, int32_t dtype);

// Create linearly spaced values
std::unique_ptr<MlxArray> linspace(float start, float stop, int32_t num, int32_t dtype);

// Create arrays with same shape as input
std::unique_ptr<MlxArray> zeros_like(const MlxArray& a);
std::unique_ptr<MlxArray> ones_like(const MlxArray& a);
std::unique_ptr<MlxArray> full_like(const MlxArray& a, float value);

// Create array from data
std::unique_ptr<MlxArray> from_slice_f32(rust::Slice<const float> data, rust::Slice<const int32_t> shape);
std::unique_ptr<MlxArray> from_slice_i32(rust::Slice<const int32_t> data, rust::Slice<const int32_t> shape);
std::unique_ptr<MlxArray> from_slice_u32(rust::Slice<const uint32_t> data, rust::Slice<const int32_t> shape);
std::unique_ptr<MlxArray> from_slice_i64(rust::Slice<const int64_t> data, rust::Slice<const int32_t> shape);

// Create array from raw bytes with specified dtype
std::unique_ptr<MlxArray> from_bytes(rust::Slice<const uint8_t> data, rust::Slice<const int32_t> shape, int32_t dtype);
std::unique_ptr<MlxArray> from_bytes_nocopy(rust::Slice<const uint8_t> data, rust::Slice<const int32_t> shape, int32_t dtype);

// Create half-precision array from raw bytes
std::unique_ptr<MlxArray> from_bytes_f16(rust::Slice<const uint8_t> data, rust::Slice<const int32_t> shape, bool bfloat16);

// Array property accessors.
// Every accessor borrows the wrapper by const reference and returns a plain
// value; none of them takes ownership of `arr`.
//
// Shape of `arr` as a Rust-owned vector of int32 extents.
rust::Vec<int32_t> array_shape(const MlxArray& arr);
// Element type of `arr` as an int32 code. The mapping between codes and MLX
// dtypes is defined by the implementation and is not visible in this header.
int32_t array_dtype(const MlxArray& arr);
// NOTE(review): presumably the total number of elements — confirm.
size_t array_size(const MlxArray& arr);
// NOTE(review): presumably the number of dimensions — confirm.
size_t array_ndim(const MlxArray& arr);
// NOTE(review): presumably the size of one element in bytes — confirm.
size_t array_itemsize(const MlxArray& arr);
// NOTE(review): presumably the total data size in bytes — confirm.
size_t array_nbytes(const MlxArray& arr);

// Array data access (scalar extraction).
// One accessor per host scalar type; each returns the value by copy.
// NOTE(review): presumably `arr` must hold exactly one element and reading it
// forces evaluation — neither is visible in this header; confirm against the
// implementation. Behavior when the array's dtype differs from the requested
// scalar type is likewise not specified here.
float item_f32(const MlxArray& arr);
int32_t item_i32(const MlxArray& arr);
int64_t item_i64(const MlxArray& arr);
bool item_bool(const MlxArray& arr);

// Copy evaluated array data to a byte buffer.
// Used by: KV cache serialization for disaggregated inference
rust::Vec<uint8_t> array_to_raw_bytes(const MlxArray& arr);

// Evaluation.
void eval(const MlxArray& arr);
void eval_all(rust::Slice<const MlxArray* const> arrays);

// Element-wise binary operations.
std::unique_ptr<MlxArray> add(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> subtract(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> remainder(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> multiply(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> divide(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> maximum(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> minimum(const MlxArray& a, const MlxArray& b);

// Element-wise unary operations.
std::unique_ptr<MlxArray> negative(const MlxArray& a);
std::unique_ptr<MlxArray> abs(const MlxArray& a);
std::unique_ptr<MlxArray> exp(const MlxArray& a);
std::unique_ptr<MlxArray> log(const MlxArray& a);
std::unique_ptr<MlxArray> sqrt(const MlxArray& a);
std::unique_ptr<MlxArray> rsqrt(const MlxArray& a);
std::unique_ptr<MlxArray> square(const MlxArray& a);
std::unique_ptr<MlxArray> sin(const MlxArray& a);
std::unique_ptr<MlxArray> cos(const MlxArray& a);
std::unique_ptr<MlxArray> tanh(const MlxArray& a);
std::unique_ptr<MlxArray> sigmoid(const MlxArray& a);
std::unique_ptr<MlxArray> floor(const MlxArray& a);
std::unique_ptr<MlxArray> ceil(const MlxArray& a);
std::unique_ptr<MlxArray> round(const MlxArray& a);
std::unique_ptr<MlxArray> sign(const MlxArray& a);
std::unique_ptr<MlxArray> reciprocal(const MlxArray& a);

// Trigonometric functions
std::unique_ptr<MlxArray> tan(const MlxArray& a);
std::unique_ptr<MlxArray> sinh(const MlxArray& a);
std::unique_ptr<MlxArray> cosh(const MlxArray& a);
std::unique_ptr<MlxArray> arcsin(const MlxArray& a);
std::unique_ptr<MlxArray> arccos(const MlxArray& a);
std::unique_ptr<MlxArray> arctan(const MlxArray& a);
std::unique_ptr<MlxArray> arctan2(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> arcsinh(const MlxArray& a);
std::unique_ptr<MlxArray> arccosh(const MlxArray& a);
std::unique_ptr<MlxArray> arctanh(const MlxArray& a);
std::unique_ptr<MlxArray> degrees(const MlxArray& a);
std::unique_ptr<MlxArray> radians(const MlxArray& a);

// Mathematical/Special functions
std::unique_ptr<MlxArray> erf(const MlxArray& a);
std::unique_ptr<MlxArray> erfinv(const MlxArray& a);
std::unique_ptr<MlxArray> expm1(const MlxArray& a);
std::unique_ptr<MlxArray> log2(const MlxArray& a);
std::unique_ptr<MlxArray> log10(const MlxArray& a);
std::unique_ptr<MlxArray> logaddexp(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> power(const MlxArray& a, const MlxArray& b);

// Checks
std::unique_ptr<MlxArray> isnan(const MlxArray& a);
std::unique_ptr<MlxArray> isinf(const MlxArray& a);
std::unique_ptr<MlxArray> isfinite(const MlxArray& a);
std::unique_ptr<MlxArray> isneginf(const MlxArray& a);
std::unique_ptr<MlxArray> isposinf(const MlxArray& a);

// Reduction operations.
std::unique_ptr<MlxArray> sum_all(const MlxArray& a);
std::unique_ptr<MlxArray> sum_axis(const MlxArray& a, int32_t axis, bool keepdims);
std::unique_ptr<MlxArray> mean_all(const MlxArray& a);
std::unique_ptr<MlxArray> mean_axis(const MlxArray& a, int32_t axis, bool keepdims);
std::unique_ptr<MlxArray> max_all(const MlxArray& a);
std::unique_ptr<MlxArray> max_axis(const MlxArray& a, int32_t axis, bool keepdims);
std::unique_ptr<MlxArray> min_all(const MlxArray& a);
std::unique_ptr<MlxArray> min_axis(const MlxArray& a, int32_t axis, bool keepdims);

// Product reduction
std::unique_ptr<MlxArray> prod_all(const MlxArray& a);
std::unique_ptr<MlxArray> prod_axis(const MlxArray& a, int32_t axis, bool keepdims);

// Variance and standard deviation
std::unique_ptr<MlxArray> var_all(const MlxArray& a);
std::unique_ptr<MlxArray> var_axis(const MlxArray& a, int32_t axis, bool keepdims, int32_t ddof);
std::unique_ptr<MlxArray> std_all(const MlxArray& a);
std::unique_ptr<MlxArray> std_axis(const MlxArray& a, int32_t axis, bool keepdims, int32_t ddof);

// Logsumexp
std::unique_ptr<MlxArray> logsumexp_all(const MlxArray& a);
std::unique_ptr<MlxArray> logsumexp_axis(const MlxArray& a, int32_t axis, bool keepdims);

// All/any reductions
std::unique_ptr<MlxArray> all_all(const MlxArray& a);
std::unique_ptr<MlxArray> any_all(const MlxArray& a);

// Matrix operations.
std::unique_ptr<MlxArray> matmul(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> transpose(const MlxArray& a);
std::unique_ptr<MlxArray> transpose_axes(const MlxArray& a, rust::Slice<const int32_t> axes);
std::unique_ptr<MlxArray> reshape(const MlxArray& a, rust::Slice<const int32_t> shape);

// Shape operations.
std::unique_ptr<MlxArray> expand_dims(const MlxArray& a, int32_t axis);
std::unique_ptr<MlxArray> expand_dims_multi(
    const MlxArray& a,
    rust::Slice<const int32_t> axes);
std::unique_ptr<MlxArray> squeeze(const MlxArray& a);
std::unique_ptr<MlxArray> squeeze_axis(const MlxArray& a, int32_t axis);
std::unique_ptr<MlxArray> broadcast_to(const MlxArray& a, rust::Slice<const int32_t> shape);

// Flatten array
std::unique_ptr<MlxArray> flatten(const MlxArray& a);
std::unique_ptr<MlxArray> flatten_range(const MlxArray& a, int32_t start_axis, int32_t end_axis);

// Move axis
std::unique_ptr<MlxArray> moveaxis(const MlxArray& a, int32_t source, int32_t destination);

// Pad array
std::unique_ptr<MlxArray> pad(const MlxArray& a, rust::Slice<const int32_t> pad_width, float pad_value);

// Split array at indices
std::unique_ptr<MlxArray> split_at_indices(const MlxArray& a, rust::Slice<const int32_t> indices, int32_t axis);

// Diagonal operations
std::unique_ptr<MlxArray> diag(const MlxArray& a, int32_t k);
std::unique_ptr<MlxArray> diagonal(const MlxArray& a, int32_t offset, int32_t axis1, int32_t axis2);

// Type conversion.
std::unique_ptr<MlxArray> astype(const MlxArray& a, int32_t dtype);

// Copy.
std::unique_ptr<MlxArray> copy(const MlxArray& a);

// High-level operations for LLM inference.
// Softmax along axis
std::unique_ptr<MlxArray> softmax(const MlxArray& a, int32_t axis);

// Softmax along axis with precise=true (f32 accumulation for f16 inputs)
std::unique_ptr<MlxArray> softmax_precise(const MlxArray& a, int32_t axis);

// Log-softmax along axis (numerically stable)
std::unique_ptr<MlxArray> log_softmax(const MlxArray& a, int32_t axis);

// RMS normalization
std::unique_ptr<MlxArray> rms_norm(const MlxArray& x, const MlxArray& weight, float eps);

// Layer normalization
std::unique_ptr<MlxArray> layer_norm(const MlxArray& x, const MlxArray& weight,
                                     const MlxArray& bias, float eps);

// Concatenate arrays along axis
std::unique_ptr<MlxArray> concatenate(rust::Slice<const MlxArray* const> arrays, int32_t axis);

// Split array into multiple parts
rust::Vec<std::unique_ptr<MlxArray>> split(const MlxArray& a, int32_t num_splits, int32_t axis);

// Slice array with per-dimension start and stop indices (no step parameter here)
std::unique_ptr<MlxArray> slice(const MlxArray& a,
                                rust::Slice<const int32_t> starts,
                                rust::Slice<const int32_t> stops);

// Slice update: src[starts:stops] = update (for in-place KV cache updates)
// Returns a new array with the update applied
std::unique_ptr<MlxArray> slice_update(const MlxArray& src,
                                        const MlxArray& update,
                                        rust::Slice<const int32_t> starts,
                                        rust::Slice<const int32_t> stops);

// Argmax along axis
std::unique_ptr<MlxArray> argmax(const MlxArray& a, int32_t axis, bool keepdims);

// Where (conditional select)
std::unique_ptr<MlxArray> where_cond(const MlxArray& condition, const MlxArray& x, const MlxArray& y);

// Comparison operations
std::unique_ptr<MlxArray> greater(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> less(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> equal(const MlxArray& a, const MlxArray& b);

// Seed the global MLX random number generator
void random_seed(uint64_t seed);

// Random categorical sampling
std::unique_ptr<MlxArray> random_categorical(const MlxArray& logits, int32_t axis);

// Transformer-specific high-level operations (reduces FFI calls).
// Rotary position embedding (RoPE)
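// Returns one array (this signature cannot return a (cos, sin) pair).
// NOTE(review): confirm whether the result is `x` with RoPE applied or a
// packed cos/sin table.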
std::unique_ptr<MlxArray> rope_forward(
    const MlxArray& x,
    int32_t head_dim,
    float theta,
    int32_t offset,
    bool traditional
);

// Apply rotary embedding to query/key
std::unique_ptr<MlxArray> apply_rope(
    const MlxArray& x,
    const MlxArray& cos,
    const MlxArray& sin
);

// Scaled dot-product attention (entire attention computation in one call)
// q: [batch, n_heads, seq_len, head_dim]
// k: [batch, n_kv_heads, seq_len, head_dim]
// v: [batch, n_kv_heads, seq_len, head_dim]
// mask: optional attention mask
// scale: attention scale factor
std::unique_ptr<MlxArray> scaled_dot_product_attention(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    float scale,
    const MlxArray* mask  // nullable
);

// Linear layer forward (with optional bias)
std::unique_ptr<MlxArray> linear_forward(
    const MlxArray& x,
    const MlxArray& weight,
    const MlxArray* bias  // nullable
);

// Quantized linear layer forward
// biases: nullable for mxfp4/nvfp4/mxfp8 modes (no per-group bias)
std::unique_ptr<MlxArray> quantized_linear_forward(
    const MlxArray& x,
    const MlxArray& weight,
    const MlxArray& scales,
    const MlxArray* biases,       // nullable for mxfp4/nvfp4/mxfp8
    const MlxArray* linear_bias,  // nullable
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// SwiGLU MLP forward (common in LLMs like Llama)
// output = down_proj(silu(gate_proj(x)) * up_proj(x))
std::unique_ptr<MlxArray> swiglu_mlp_forward(
    const MlxArray& x,
    const MlxArray& gate_proj,
    const MlxArray& up_proj,
    const MlxArray& down_proj
);

// Compiled relu_squared: square(maximum(x, 0)) — single fused kernel
std::unique_ptr<MlxArray> compiled_relu_squared(const MlxArray& x);

// Compiled silu: x * sigmoid(x) — single fused kernel
std::unique_ptr<MlxArray> compiled_silu(const MlxArray& x);

// Compiled gelu: x * 0.5 * (1 + erf(x / sqrt(2))) — single fused kernel
// Used by: Gemma2, Gemma3, StarCoder2, and other GELU-based models
std::unique_ptr<MlxArray> compiled_gelu(const MlxArray& x);

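// Compiled gelu_approx: erf-based GELU (x * 0.5 * (1 + erf(x / sqrt(2)))) — fused kernel
// Uses erf instead of tanh for numerical stability with bf16 inputs.
// Used by: Gemma2, Gemma3 (matches Python nn.gelu_approx)
// NOTE(review): the formula above is identical to the one given for
// compiled_gelu, and the GeGLU "approx" declarations in this header describe
// gelu_approx as the tanh approximation. Confirm which formula this function
// implements and which Python reference it is meant to match.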
std::unique_ptr<MlxArray> compiled_gelu_approx(const MlxArray& x);

// Compiled gelu_topk: sparse GELU with dynamic threshold — single fused kernel
// gelu_approx(max(0, x - (mean + std * multiplier)))
// Used by: Gemma3n MLP layers with activation_sparsity > 0
std::unique_ptr<MlxArray> compiled_gelu_topk(
    const MlxArray& x,
    float std_multiplier
);

// SwiGLU activation only - compiled with kernel fusion (shapeless=true)
// output = silu(gate) * x
// Uses mlx::core::compile for kernel fusion (like Python's @mx.compile)
std::unique_ptr<MlxArray> compiled_swiglu_activation(
    const MlxArray& gate,
    const MlxArray& x
);

// GptOss SwiGLU activation only - compiled with kernel fusion (shapeless=true)
// output = clipped_gate * sigmoid(1.702 * clipped_gate) * (clipped_up + 1)
// Used by: GptOss
std::unique_ptr<MlxArray> compiled_gpt_oss_swiglu_activation(
    const MlxArray& x_linear,
    const MlxArray& x_glu
);

// GeGLU activation - compiled with kernel fusion (shapeless=true)
// output = gelu(gate) * x
// Used by: Gemma, Gemma2, Gemma3 MLP layers
std::unique_ptr<MlxArray> compiled_geglu_activation(
    const MlxArray& gate,
    const MlxArray& x
);

// GeGLU activation with Python MLX tanh-approx GELU.
// output = gelu_approx(gate) * x
// Used by: Gemma4 MLP and SwitchGeGLU layers
std::unique_ptr<MlxArray> compiled_geglu_approx_activation(
    const MlxArray& gate,
    const MlxArray& x
);

// Compiled softcap attention scores: tanh(scores * inv_cap) * cap
// Fuses divide + tanh + multiply into single compiled kernel
// Used by: Gemma2 attention with logit softcapping
std::unique_ptr<MlxArray> compiled_softcap(
    const MlxArray& scores,
    float cap
);

// Compiled clip_residual for float16 overflow prevention
// When float16: cast to f32, add, clip to f16 range, cast back
// When other dtype: simple addition
// Used by: Gemma3 residual connections
std::unique_ptr<MlxArray> compiled_clip_residual(
    const MlxArray& x,
    const MlxArray& y
);

// Softcap SDPA: Q@K^T * scale -> softcap -> mask -> softmax -> @V
// Combines the entire manual attention path into one compiled call
// Used by: Gemma2 attention with logit softcapping
std::unique_ptr<MlxArray> compiled_softcap_sdpa(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    float scale,
    float softcap,
    const MlxArray* mask
);

// Softcap SDPA with GQA: handles repeat_kv + attention in compiled graph
// Avoids separate repeat_kv FFI calls by incorporating GQA internally
// Used by: Gemma2 attention (GQA + softcap)
std::unique_ptr<MlxArray> compiled_softcap_sdpa_gqa(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    float scale,
    float softcap,
    int32_t n_rep,
    const MlxArray* mask
);

// Compiled GELU MLP forward: down_proj(gelu(gate_proj(x)) * up_proj(x))
// Fuses gate_proj + gelu + up_proj + multiply + down_proj into compiled graph
// Used by: Gemma2, Gemma3 and other GELU-gated MLP models
std::unique_ptr<MlxArray> compiled_gelu_mlp_forward(
    const MlxArray& x,
    const MlxArray& gate_proj,
    const MlxArray& gate_scales,
    const MlxArray* gate_biases,
    const MlxArray& up_proj,
    const MlxArray& up_scales,
    const MlxArray* up_biases,
    const MlxArray& down_proj,
    const MlxArray& down_scales,
    const MlxArray* down_biases,
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Compiled GELU-approx MLP forward: down_proj(gelu_approx(gate_proj(x)) * up_proj(x))
// Fuses the quantized projections and Python MLX tanh-approx GeGLU.
// Used by: Gemma4 dense MLP
std::unique_ptr<MlxArray> compiled_gelu_approx_mlp_forward(
    const MlxArray& x,
    const MlxArray& gate_proj,
    const MlxArray& gate_scales,
    const MlxArray* gate_biases,
    const MlxArray& up_proj,
    const MlxArray& up_scales,
    const MlxArray* up_biases,
    const MlxArray& down_proj,
    const MlxArray& down_scales,
    const MlxArray* down_biases,
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Compiled GeGLU SwitchGLU MLP forward for quantized MoE experts.
// Wraps three `gather_qmm` calls (gate/up/down) plus a tanh-approx
// GeGLU activation into a single `mx::core::compile` window so MLX
// can schedule gate/up in parallel and fuse the intermediate
// element-wise ops. Only the no-sort path is fused; callers should
// fall back to separate `gather_qmm` calls when `sorted_indices` is
// true (prefill). Used by: Gemma 4 26B-a4b SwitchGeGLU experts.
std::unique_ptr<MlxArray> compiled_switch_qgeglu_forward(
    const MlxArray& x,
    const MlxArray& gate_w,
    const MlxArray& gate_s,
    const MlxArray* gate_b,
    const MlxArray& up_w,
    const MlxArray& up_s,
    const MlxArray* up_b,
    const MlxArray& down_w,
    const MlxArray& down_s,
    const MlxArray* down_b,
    const MlxArray& rhs_indices,
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Compiled SwiGLU MLP forward for non-quantized (FP16/BF16) weights:
//   down_proj(silu(gate_proj(x)) * up_proj(x))
// Fuses gate_proj + silu + up_proj + multiply + down_proj into compiled graph.
// Used by: Llama, Qwen2, Qwen3, Mistral and other SwiGLU FP models
std::unique_ptr<MlxArray> compiled_swiglu_mlp_forward_fp16(
    const MlxArray& x,
    const MlxArray& gate_weight,
    const MlxArray& up_weight,
    const MlxArray& down_weight,
    const MlxArray* gate_bias,
    const MlxArray* up_bias,
    const MlxArray* down_bias
);

// Compiled GELU MLP forward for non-quantized (FP16/BF16) weights:
//   down_proj(gelu(gate_proj(x)) * up_proj(x))
// Fuses gate_proj + gelu + up_proj + multiply + down_proj into compiled graph.
// Used by: Gemma2, Gemma3, StarCoder2 and other GELU-gated FP models
std::unique_ptr<MlxArray> compiled_gelu_mlp_forward_fp16(
    const MlxArray& x,
    const MlxArray& gate_weight,
    const MlxArray& up_weight,
    const MlxArray& down_weight,
    const MlxArray* gate_bias,
    const MlxArray* up_bias,
    const MlxArray* down_bias
);

// Full transformer layer forward (maximum FFI reduction)
// Combines: attention + MLP + residuals + norms
std::unique_ptr<MlxArray> transformer_layer_forward(
    const MlxArray& x,
    const MlxArray& attn_norm_weight,
    const MlxArray& q_proj,
    const MlxArray& k_proj,
    const MlxArray& v_proj,
    const MlxArray& o_proj,
    const MlxArray& ffn_norm_weight,
    const MlxArray& gate_proj,
    const MlxArray& up_proj,
    const MlxArray& down_proj,
    const MlxArray* kv_cache_k,  // nullable for first token
    const MlxArray* kv_cache_v,  // nullable for first token
    int32_t n_heads,
    int32_t n_kv_heads,
    int32_t head_dim,
    float rope_theta,
    int32_t rope_offset,
    float norm_eps
);

// Advanced indexing operations.
// Take elements along an axis using indices
std::unique_ptr<MlxArray> take(const MlxArray& a, const MlxArray& indices, int32_t axis);

// Gather elements using indices (multi-dimensional indexing)
// indices can be a vector of index arrays for each dimension
std::unique_ptr<MlxArray> gather(
    const MlxArray& a,
    rust::Slice<const MlxArray* const> indices,
    rust::Slice<const int32_t> axes,
    rust::Slice<const int32_t> slice_sizes
);

// Take along axis (like numpy.take_along_axis)
std::unique_ptr<MlxArray> take_along_axis(const MlxArray& a, const MlxArray& indices, int32_t axis);

// Put along axis (scatter update)
std::unique_ptr<MlxArray> put_along_axis(const MlxArray& a, const MlxArray& indices,
                                          const MlxArray& values, int32_t axis);

// Stack arrays along new axis
std::unique_ptr<MlxArray> stack(rust::Slice<const MlxArray* const> arrays, int32_t axis);

// Tile/repeat array
std::unique_ptr<MlxArray> tile(const MlxArray& a, rust::Slice<const int32_t> reps);
std::unique_ptr<MlxArray> repeat(const MlxArray& a, int32_t repeats, int32_t axis);

// Arange
std::unique_ptr<MlxArray> arange_f32(float start, float stop, float step);
std::unique_ptr<MlxArray> arange_i32(int32_t start, int32_t stop, int32_t step);

// Logical operations.
std::unique_ptr<MlxArray> logical_not(const MlxArray& a);
std::unique_ptr<MlxArray> logical_and(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> logical_or(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> all_axis(const MlxArray& a, int32_t axis, bool keepdims);
std::unique_ptr<MlxArray> any_axis(const MlxArray& a, int32_t axis, bool keepdims);
std::unique_ptr<MlxArray> greater_equal(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> less_equal(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> not_equal(const MlxArray& a, const MlxArray& b);

// Activation functions.
std::unique_ptr<MlxArray> silu(const MlxArray& a);
std::unique_ptr<MlxArray> gelu(const MlxArray& a);
std::unique_ptr<MlxArray> gelu_approx(const MlxArray& a);
std::unique_ptr<MlxArray> relu(const MlxArray& a);
std::unique_ptr<MlxArray> leaky_relu(const MlxArray& a, float negative_slope);

// Sorting and searching.
std::unique_ptr<MlxArray> argsort(const MlxArray& a, int32_t axis);
std::unique_ptr<MlxArray> argpartition(const MlxArray& a, int32_t kth, int32_t axis);
std::unique_ptr<MlxArray> argmin(const MlxArray& a, int32_t axis, bool keepdims);
std::unique_ptr<MlxArray> topk(const MlxArray& a, int32_t k, int32_t axis);

// Sort and partition
std::unique_ptr<MlxArray> sort(const MlxArray& a, int32_t axis);
std::unique_ptr<MlxArray> partition(const MlxArray& a, int32_t kth, int32_t axis);

// Cumulative operations
std::unique_ptr<MlxArray> cummax(const MlxArray& a, int32_t axis, bool reverse, bool inclusive);
std::unique_ptr<MlxArray> cummin(const MlxArray& a, int32_t axis, bool reverse, bool inclusive);
std::unique_ptr<MlxArray> cumprod(const MlxArray& a, int32_t axis, bool reverse, bool inclusive);

// Scatter operations
std::unique_ptr<MlxArray> scatter(const MlxArray& a, const MlxArray& indices, const MlxArray& updates, int32_t axis);
std::unique_ptr<MlxArray> scatter_add(const MlxArray& a, const MlxArray& indices, const MlxArray& updates, int32_t axis);
std::unique_ptr<MlxArray> scatter_max(const MlxArray& a, const MlxArray& indices, const MlxArray& updates, int32_t axis);
std::unique_ptr<MlxArray> scatter_min(const MlxArray& a, const MlxArray& indices, const MlxArray& updates, int32_t axis);
std::unique_ptr<MlxArray> scatter_prod(const MlxArray& a, const MlxArray& indices, const MlxArray& updates, int32_t axis);

// Bitwise operations
std::unique_ptr<MlxArray> bitwise_and(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> bitwise_or(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> bitwise_xor(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> left_shift(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> right_shift(const MlxArray& a, const MlxArray& b);

// Linear algebra
std::unique_ptr<MlxArray> tensordot(const MlxArray& a, const MlxArray& b, int32_t axes);
std::unique_ptr<MlxArray> inner(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> outer(const MlxArray& a, const MlxArray& b);
std::unique_ptr<MlxArray> trace(const MlxArray& a, int32_t offset, int32_t axis1, int32_t axis2);

// Roll (circular shift)
std::unique_ptr<MlxArray> roll(const MlxArray& a, int32_t shift, int32_t axis);

// Nan handling
std::unique_ptr<MlxArray> nan_to_num(const MlxArray& a, float nan_val, float posinf_val, float neginf_val);

// Stop gradient
std::unique_ptr<MlxArray> stop_gradient(const MlxArray& a);

// 2D convolution
std::unique_ptr<MlxArray> conv2d(
    const MlxArray& input,
    const MlxArray& weight,
    int32_t stride_h, int32_t stride_w,
    int32_t padding_h, int32_t padding_w,
    int32_t dilation_h, int32_t dilation_w,
    int32_t groups
);

// 2D average pooling
// Used by: VisionModule (Gemma3 AvgPool projector)
std::unique_ptr<MlxArray> avg_pool2d(
    const MlxArray& input,
    int32_t kernel_h, int32_t kernel_w,
    int32_t stride_h, int32_t stride_w,
    int32_t padding_h, int32_t padding_w
);

// MoE (Mixture of Experts) operations.
// Gather matrix multiply for MoE
// a, b: the two matrix operands.
// lhs_indices / rhs_indices: optional index arrays, passed as raw pointers so
//   the caller can pass null to omit either one.
// sorted_indices: if true, lhs_indices are pre-sorted for better memory access
std::unique_ptr<MlxArray> gather_mm(
    const MlxArray& a,
    const MlxArray& b,
    const MlxArray* lhs_indices,    // nullable
    const MlxArray* rhs_indices,    // nullable
    bool sorted_indices
);

// Gather quantized matrix multiply for MoE
// x: activation operand; w / scales / biases: the quantized weight and its
//   quantization parameters (biases may be null for no-bias quantization).
// lhs_indices / rhs_indices: optional index arrays (null to omit).
// transpose, group_size, bits: quantized-matmul options forwarded by value.
// sorted_indices: if true, lhs_indices are pre-sorted for better memory access
// mode: quantization mode name passed from Rust as a string slice.
//   NOTE(review): the accepted mode strings are not listed in this header —
//   confirm against the definition.
std::unique_ptr<MlxArray> gather_qmm(
    const MlxArray& x,
    const MlxArray& w,
    const MlxArray& scales,
    const MlxArray* biases,         // nullable for no-bias quantization
    const MlxArray* lhs_indices,    // nullable
    const MlxArray* rhs_indices,    // nullable
    bool transpose,
    int32_t group_size,
    int32_t bits,
    bool sorted_indices,
    rust::Str mode
);

// Direct quantized matrix multiplication
// y = x @ dequantize(w, scales, biases).T if transpose else x @ dequantize(w, scales, biases)
// biases is a raw pointer so callers can pass null for no-bias quantization.
// group_size / bits / mode describe the quantization of `w`.
// NOTE(review): the accepted `mode` strings are not listed in this header —
// confirm against the definition.
std::unique_ptr<MlxArray> quantized_matmul(
    const MlxArray& x,
    const MlxArray& w,
    const MlxArray& scales,
    const MlxArray* biases,         // nullable for no-bias quantization
    bool transpose,
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Dequantize quantized weights
// Returns full-precision weights from quantized representation
// w / scales / biases: the quantized tensor and its quantization parameters;
//   biases may be null for the mxfp4 / nvfp4 / mxfp8 modes.
// group_size / bits / mode: quantization settings the weights were produced
//   with. NOTE(review): the output dtype is not visible in this header —
//   confirm against the definition.
std::unique_ptr<MlxArray> dequantize(
    const MlxArray& w,
    const MlxArray& scales,
    const MlxArray* biases,     // nullable for mxfp4/nvfp4/mxfp8
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Embedding.
// Takes the embedding table (`weight`) and an `indices` array.
// NOTE(review): presumably gathers rows of `weight` selected by `indices` —
// confirm against the definition.
std::unique_ptr<MlxArray> embedding(const MlxArray& weight, const MlxArray& indices);

// Quantized embedding lookup with dequantization
// weight / scales / biases: the quantized embedding table and its
//   quantization parameters; biases may be null for mxfp4 / nvfp4 / mxfp8.
// indices: the entries to look up.
// group_size / bits / mode: quantization settings of the table.
// NOTE(review): whether only the selected rows are dequantized is not
// visible in this header — confirm against the definition.
std::unique_ptr<MlxArray> quantized_embedding(
    const MlxArray& weight,
    const MlxArray& scales,
    const MlxArray* biases,     // nullable for mxfp4/nvfp4/mxfp8
    const MlxArray& indices,
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Fast operations (using MLX fast kernels).
// Fast RoPE using MLX fast kernel
// Options are passed by value: `dims`, `traditional`, `base`, `scale` and an
// integer position `offset`.
// NOTE(review): presumably `dims` is the number of rotated feature dimensions
// and `traditional` selects the pairing layout — confirm against the
// definition.
std::unique_ptr<MlxArray> fast_rope(
    const MlxArray& x,
    int32_t dims,
    bool traditional,
    float base,
    float scale,
    int32_t offset
);

// Fast RoPE with custom frequencies (for Yarn RoPE)
// Same option set as fast_rope except that there is no `base` parameter;
// the caller supplies a `freqs` array instead.
// NOTE(review): the expected shape of `freqs` is not visible in this header —
// confirm against the definition.
std::unique_ptr<MlxArray> fast_rope_with_freqs(
    const MlxArray& x,
    int32_t dims,
    bool traditional,
    float scale,
    int32_t offset,
    const MlxArray& freqs
);

// Compiled ProportionalRoPE (Gemma 4 full-attention layers). Wraps the
// mlx-lm full-head `fast::rope` call with an `inf` frequency tail in one
// `mx::core::compile` window. Requires `rotated_dims > 0` and
// `last_dim == head_dim`; the rare `last_dim > head_dim` tail case must
// stay on the op-at-a-time path. `offset` flows through as a scalar array
// input so the same compiled graph serves every decode step.
//
// x: input to rotate; freqs: frequency array for the rope call.
// head_dim / rotated_dims: sizes referenced by the preconditions above.
// NOTE(review): what happens when a precondition is violated (error vs.
// fallback) is not visible in this header — callers should check first.
std::unique_ptr<MlxArray> compiled_proportional_rope(
    const MlxArray& x,
    const MlxArray& freqs,
    int32_t head_dim,
    int32_t rotated_dims,
    int32_t offset
);

// Compiled Gemma 4 Q-path with proportional RoPE. Folds
// `reshape → fast::rms_norm → transpose → full-head ProportionalRoPE`
// into one compile window so MLX sees a single fused subgraph instead of
// four cxx-bridge calls. Used on Gemma 4 full-attention layers only.
//
// q_proj_out: output of the Q projection, the input to the fused chain.
// q_norm_weight / rms_eps: weight and epsilon for the rms_norm step.
// freqs / rotated_dims / offset: inputs to the ProportionalRoPE step.
// n_heads / head_dim: head geometry. NOTE(review): presumably these drive the
//   reshape step — confirm against the definition.
std::unique_ptr<MlxArray> compiled_q_path_proportional(
    const MlxArray& q_proj_out,
    const MlxArray& q_norm_weight,
    const MlxArray& freqs,
    float rms_eps,
    int32_t n_heads,
    int32_t head_dim,
    int32_t rotated_dims,
    int32_t offset
);

// Compiled Gemma 4 per-layer-input-gate chain (e2b / e4b variants).
// Fuses `gate_proj → gelu_approx → multiply(per_layer) → proj →
// post_norm → add(after_ffn)` into one compile window. Requires
// affine / gs=64 / bits=4 with biases present; other modes fall
// through to an op-at-a-time fallback.
//
// after_ffn: tensor added back in the final `add` step.
// per_layer_input: operand of the `multiply` step.
// gate_w / gate_s / gate_b: quantized weight, scales and biases of gate_proj.
// proj_w / proj_s / proj_b: quantized weight, scales and biases of proj.
//   The bias arguments are raw pointers. NOTE(review): presumably they may be
//   null, in which case the fallback path is taken — confirm.
// post_norm_w / post_norm_eps: weight and epsilon of the post_norm step.
// group_size / bits / mode: quantization settings shared by both projections.
std::unique_ptr<MlxArray> compiled_per_layer_input_gate(
    const MlxArray& after_ffn,
    const MlxArray& per_layer_input,
    const MlxArray& gate_w,
    const MlxArray& gate_s,
    const MlxArray* gate_b,
    const MlxArray& proj_w,
    const MlxArray& proj_s,
    const MlxArray* proj_b,
    const MlxArray& post_norm_w,
    float post_norm_eps,
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);

// Fast RMS norm using MLX fast kernel
// x: input array; weight: scale array (required in this overload — see
// fast_rms_norm_no_weight for the weightless form); eps: epsilon passed
// through as a float.
std::unique_ptr<MlxArray> fast_rms_norm(
    const MlxArray& x,
    const MlxArray& weight,
    float eps
);

// Fast RMS norm without a learnable scale
// Takes only the input array and the epsilon; there is no weight parameter.
std::unique_ptr<MlxArray> fast_rms_norm_no_weight(
    const MlxArray& x,
    float eps
);

// Fast layer norm using MLX fast kernel
// weight and bias are raw pointers so that either may be passed as null;
// eps is passed through as a float.
// NOTE(review): the axis that is normalized is not visible in this header —
// confirm against the definition.
std::unique_ptr<MlxArray> fast_layer_norm(
    const MlxArray& x,
    const MlxArray* weight,  // nullable
    const MlxArray* bias,    // nullable
    float eps
);

// Fast scaled dot product attention using MLX fast kernel
// q / k / v: query, key and value arrays; scale: float scaling factor.
// mask: optional mask array; pass null for no mask.
// NOTE(review): expected q/k/v layout and mask dtype are not visible in this
// header — confirm against the definition.
std::unique_ptr<MlxArray> fast_scaled_dot_product_attention(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    float scale,
    const MlxArray* mask  // nullable
);

// Fast SDPA with optional sinks (per-head attention bias for first position)
// Used by: GptOss
// Same leading parameters as fast_scaled_dot_product_attention, plus `sinks`.
// `mask` and `sinks` are both raw pointers.
// NOTE(review): presumably either may be null to omit it (sinks are described
// as optional above) — confirm against the definition.
std::unique_ptr<MlxArray> fast_scaled_dot_product_attention_with_sinks(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    float scale,
    const MlxArray* mask,
    const MlxArray* sinks
);

// SDPA with explicit causal masking for prefill (no mask array needed)
// Takes only q / k / v and the float `scale`; there is no mask parameter in
// this variant.
std::unique_ptr<MlxArray> fast_scaled_dot_product_attention_causal(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    float scale
);

// Decode-only paged attention over dense compatibility KV caches.
// q: query array.
// cache_keys / cache_values: slices of non-owning MlxArray pointers supplied
//   from Rust.
// kv_lens, block_tables, block_table_offsets: flat int32 slices.
//   NOTE(review): presumably one kv_len per sequence and offsets indexing
//   into the flattened block_tables — the layout is not visible in this
//   header; confirm against the definition.
// block_size / scale: passed by value.
std::unique_ptr<MlxArray> paged_decode_attention_dense_compat(
    const MlxArray& q,
    rust::Slice<const MlxArray* const> cache_keys,
    rust::Slice<const MlxArray* const> cache_values,
    rust::Slice<const int32_t> kv_lens,
    rust::Slice<const int32_t> block_tables,
    rust::Slice<const int32_t> block_table_offsets,
    int32_t block_size,
    float scale
);

// Decode-only paged attention over rotating ring-buffer KV caches.
// q: query array.
// cache_keys / cache_values: slices of non-owning MlxArray pointers supplied
//   from Rust.
// kv_lens / logical_starts: flat int32 slices.
//   NOTE(review): presumably one entry per sequence, with logical_starts
//   giving the ring-buffer position of the oldest entry — confirm against
//   the definition.
// block_size / scale: passed by value.
std::unique_ptr<MlxArray> paged_decode_attention_rotating_compat(
    const MlxArray& q,
    rust::Slice<const MlxArray* const> cache_keys,
    rust::Slice<const MlxArray* const> cache_values,
    rust::Slice<const int32_t> kv_lens,
    rust::Slice<const int32_t> logical_starts,
    int32_t block_size,
    float scale
);

// Upstream MLX SDPA capability helpers for Metal/NAX instrumentation.
// Returns a bool for the given q / k / v arrays and mask configuration flags
// (`has_mask`, `has_arr_mask`, `do_causal`).
// NOTE(review): presumably true means the fast SDPA kernel would be selected
// for these inputs — confirm against the definition.
bool sdpa_supports_fast_path(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    bool has_mask,
    bool has_arr_mask,
    bool do_causal
);

// Capability query with the same parameter list as sdpa_supports_fast_path.
// NOTE(review): presumably reports whether the NAX SDPA path is usable for
// these inputs — confirm against the definition.
bool sdpa_supports_nax(
    const MlxArray& q,
    const MlxArray& k,
    const MlxArray& v,
    bool has_mask,
    bool has_arr_mask,
    bool do_causal
);

// Fused QKV projection + reshape + transpose + RoPE
// Reduces FFI overhead for the projection chain
// x: input activations.
// weight / scales / biases: quantized projection weight and its parameters;
//   biases may be null for mxfp4 / nvfp4 / mxfp8.
// num_heads / head_dim: head geometry for the reshape.
// rope_dims / rope_base / cache_offset: RoPE settings.
// group_size / bits / mode: quantization settings of `weight`.
// apply_rope: NOTE(review): presumably skips the RoPE step when false —
//   confirm against the definition.
std::unique_ptr<MlxArray> fused_qkv_project_and_rope(
    const MlxArray& x,
    const MlxArray& weight,
    const MlxArray& scales,
    const MlxArray* biases,     // nullable for mxfp4/nvfp4/mxfp8
    int32_t num_heads,
    int32_t head_dim,
    int32_t rope_dims,
    float rope_base,
    int32_t cache_offset,
    int32_t group_size,
    int32_t bits,
    bool apply_rope,
    rust::Str mode
);

// Fused concatenated QKV projection + split + reshape + transpose + RoPE.
// Used by: Llama3-family fused attention preparation path.
// Returns void; the three results are delivered through the
// `std::unique_ptr<MlxArray>&` out-parameters q_out / k_out / v_out.
// weight / scales / biases: quantized concatenated-QKV weight and its
//   parameters; biases may be null for mxfp4 / nvfp4 / mxfp8.
// num_heads / num_kv_heads / head_dim: head geometry used for the split.
// rope_dims / rope_base / cache_offset: RoPE settings.
// group_size / bits / mode: quantization settings of `weight`.
// NOTE(review): whether any existing contents of the out-parameters are
// replaced unconditionally is not visible in this header — confirm.
void fused_qkv_project_split_rope(
    const MlxArray& x,
    const MlxArray& weight,
    const MlxArray& scales,
    const MlxArray* biases,     // nullable for mxfp4/nvfp4/mxfp8
    int32_t num_heads,
    int32_t num_kv_heads,
    int32_t head_dim,
    int32_t rope_dims,
    float rope_base,
    int32_t cache_offset,
    int32_t group_size,
    int32_t bits,
    rust::Str mode,
    std::unique_ptr<MlxArray>& q_out,
    std::unique_ptr<MlxArray>& k_out,
    std::unique_ptr<MlxArray>& v_out
);

// Experimental dense causal prefill attention path:
// qkv projection + split + rope + native causal SDPA + output projection.
// Returns output plus K/V tensors so Rust can populate the KV cache.
// Results are delivered through the out-parameters output_out / k_out /
// v_out; the function itself returns void.
// qkv_weight / qkv_scales / qkv_biases: quantized QKV projection parameters.
// o_weight / o_scales / o_biases: quantized output projection parameters.
//   Both bias arguments are raw pointers. NOTE(review): presumably nullable,
//   as with the other quantized entry points in this header — confirm.
// num_heads / num_kv_heads / head_dim: head geometry.
// rope_dims / rope_base: RoPE settings. There is no cache-offset parameter
//   in this signature.
// scale: attention scaling factor.
// group_size / bits / mode: quantization settings. NOTE(review): presumably
//   shared by both projections — confirm.
void fused_causal_prefill_attention(
    const MlxArray& x,
    const MlxArray& qkv_weight,
    const MlxArray& qkv_scales,
    const MlxArray* qkv_biases,
    const MlxArray& o_weight,
    const MlxArray& o_scales,
    const MlxArray* o_biases,
    int32_t num_heads,
    int32_t num_kv_heads,
    int32_t head_dim,
    int32_t rope_dims,
    float rope_base,
    float scale,
    int32_t group_size,
    int32_t bits,
    rust::Str mode,
    std::unique_ptr<MlxArray>& output_out,
    std::unique_ptr<MlxArray>& k_out,
    std::unique_ptr<MlxArray>& v_out
);

// Compiled operations (with kernel fusion).
// Compiled full MoE expert forward
// Compiles: silu(gate_proj(x)) * up_proj(x), then down_proj
// Note: compiled path only supports affine mode; non-affine modes fall back to non-compiled
//
// x: input activations.
// gate_* / up_* / down_*: quantized weight, scales and biases of the three
//   projections; each biases pointer may be null for mxfp4 / nvfp4 / mxfp8.
// group_size / bits / mode: quantization settings passed once for all three
//   projections.
std::unique_ptr<MlxArray> compiled_moe_expert_forward(
    const MlxArray& x,
    const MlxArray& gate_proj,
    const MlxArray& gate_scales,
    const MlxArray* gate_biases,    // nullable for mxfp4/nvfp4/mxfp8
    const MlxArray& up_proj,
    const MlxArray& up_scales,
    const MlxArray* up_biases,      // nullable for mxfp4/nvfp4/mxfp8
    const MlxArray& down_proj,
    const MlxArray& down_scales,
    const MlxArray* down_biases,    // nullable for mxfp4/nvfp4/mxfp8
    int32_t group_size,
    int32_t bits,
    rust::Str mode
);