# mlxcel/Cargo.toml (branch `main`) from github.com/lablup/mlxcel

[workspace]
# Feature resolver version is pinned explicitly for the whole workspace.
resolver = "2"
# The root package (".") plus the two library crates under src/lib.
members = [
    ".",
    "src/lib/mlxcel-core",
    "src/lib/mlxcel-surgery",
]

[package]
# Metadata for the root `mlxcel` package: name and version first, remaining
# keys alphabetical, description last.
name = "mlxcel"
version = "0.0.27"
categories = ["science", "command-line-utilities"]
edition = "2024"
keywords = ["mlx", "llm", "vlm", "inference"]
license = "Apache-2.0"
repository = "https://github.com/lablup/mlxcel"
description = "High-performance LLM/VLM/VLA inference on Apple Silicon and CUDA GPUs"

# Binary named after the package; built from the crate's `src/main.rs`.
[[bin]]
name = "mlxcel"
path = "src/main.rs"

# The target name (`mlxcel-server`) differs from its source file name
# (`mlx_server.rs`), so the path has to be given explicitly.
[[bin]]
name = "mlxcel-server"
path = "src/bin/mlx_server.rs"

# Perf benchmark harness for the speculative drafter pairings shipped by
# epic #633 / issue #632. Captures no-drafter baseline tok/s and scaffolds
# the speculative numerator rows that follow-up #666 will fill in. See
# `src/bin/speculative_bench.rs` for the module docs and `docs/model_tests.md`
# (`Speculative drafters (epic #633)`) for the deferral matrix.
[[bin]]
name = "speculative_bench"
path = "src/bin/speculative_bench.rs"

[features]
# Enabled unless the build passes `--no-default-features`.
default = ["surgery"]
# Backend features: each one only forwards to the same-named feature of
# `mlxcel-core`, which this manifest depends on with
# `default-features = false`.
metal = ["mlxcel-core/metal"]
accelerate = ["mlxcel-core/accelerate"]
cuda = ["mlxcel-core/cuda"]
# Pulls in no dependencies. The `distributed_integration` and `pipeline_e2e`
# test targets list it under `required-features`, so they are only built when
# it is enabled (e.g. `cargo test --features test-utils`).
test-utils = []
# Axis A "weight-load surgery" framework (Epic #363, issues #367 / #369 / #371).
#
# On by default so production `mlxcel` and `mlxcel-server` binaries expose
# the `--surgery <config.yaml>` flag without rebuilds. Bit-exactness with
# the pre-surgery baseline is preserved without conditional features: when
# the flag is not supplied, the active-pipeline slot in `crate::surgery`
# stays `None`, the consolidated loaders call `transform.apply` zero times,
# and the load path matches `transform = None` byte-for-byte (verified by
# `crate::models::sanitize_tests::load_text_weights_with_none_transform_matches_legacy_path`).
#
# Build with `--no-default-features` to compile without the surgery crate
# entirely (e.g. CI parity tests against pre-A1 behaviour or constrained
# embedded targets).
surgery = ["dep:mlxcel-surgery"]

[dependencies]
# MLX - mlxcel-core (direct C++ bindings via cxx). Its default features are
# disabled here; the `metal` / `accelerate` / `cuda` features of this package
# forward to the matching mlxcel-core features instead.
mlxcel-core = { path = "src/lib/mlxcel-core", default-features = false }

# Axis A weight-load surgery — optional dependency activated by the `surgery`
# feature above. That feature is part of `default`, so the crate is built
# unless `--no-default-features` is passed.
mlxcel-surgery = { path = "src/lib/mlxcel-surgery", optional = true }

# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"

# Tensors & Models
safetensors = "0.7.0"
tokenizers = "0.22.2"
sentencepiece = "0.13"
hf-hub = "0.5.0"

# Zero-copy buffers
bytes = "1.9"
memmap2 = "0.9"

# LZ4 compression (pure Rust, used for tensor transfer)
lz4_flex = "0.11"

# Error Handling
anyhow = "1.0.100"
thiserror = "2.0.11"

# Cryptographic hashing (for vision feature cache keys)
sha2 = "0.10"

# Fast hashing for prompt prefix cache keys (issue #419). BLAKE3 is itself a
# cryptographic hash, but here it is used only for in-process cache-key
# digests; no security boundary depends on it.
blake3 = "1.8"

# CLI
clap = { version = "4.6.1", features = ["derive", "env"] }

# Image processing (for VLM)
image = { version = "0.25", default-features = false, features = ["png", "jpeg", "webp"] }

# Base64 (for VLM image data URIs in server API)
base64 = "0.22"

# Utils
glob = "0.3.2"
path-clean = "1.0.1"
minijinja = { version = "2.5", features = ["loop_controls"] }
# Used by `src/downloader/mod.rs::file_url` to percent-encode `repo_id` /
# `revision` / `filename` path segments before composing the HF download URL
# (issue #650, L1). Already transitively present via reqwest -> url; promoting
# to a direct dep so the downloader can call into it without depending on
# reqwest's transitive re-export.
percent-encoding = "2"

# Structured outputs / constrained decoding (issue #550).
#
# `llguidance` is the same library used by mlx-vlm upstream PR #1047 for the
# JSON-schema response_format MVP. The Rust crate exposes `Matcher` /
# `ParserFactory` directly so we don't need to round-trip through Python.
# `toktrie_hf_tokenizers` adapts our existing HuggingFace `tokenizers::Tokenizer`
# into the byte-level `TokEnv` that llguidance expects.
#
# `default-features = false, features = ["lark", "referencing"]` drops the
# optional `rayon` thread pool and the `ahash` non-deterministic hasher; we
# keep `lark` (string-form grammar parsing — used by `from_tagged_str`) and
# `referencing` (JSON-schema $ref resolution required for nested schemas).
llguidance = { version = "1.7", default-features = false, features = ["lark", "referencing"] }
toktrie_hf_tokenizers = "1.7"

# HTTP Server
tokio = { version = "1.52", features = ["full"] }
axum = { version = "0.7", features = ["json", "macros"] }
tokio-stream = "0.1"
tokio-util = "0.7"
futures = "0.3"
uuid = { version = "1.23", features = ["v4"] }
chrono = "0.4"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
tower-http = { version = "0.5", features = ["cors", "trace"] }
tower = { version = "0.4", features = ["util"] }
hyper = { version = "1.1", features = ["server"] }
hyper-util = { version = "0.1", features = ["tokio", "server-auto"] }
# HTTP client. Default features are disabled and `rustls-tls` is selected, so
# TLS comes from rustls rather than reqwest's default TLS backend.
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls", "stream"] }
# General-purpose crates grouped here: progress bars, async stream macros,
# regular expressions, and TOML parsing.
indicatif = "0.18"
async-stream = "0.3"
fancy-regex = "0.17.0"
toml = "0.8"

[target.'cfg(unix)'.dependencies]
# Used for the RDMA-aware transport backend capability probe (io_uring on
# Linux, kqueue / writev batched sends on macOS). Declared under a
# `cfg(unix)` target table so Cargo only pulls the crate in for Unix-family
# targets; this does not affect Windows builds of the library today.
libc = "0.2"

[profile.release]
# Maximum speed optimization (inference is compute-bound).
opt-level = 3
# Full ("fat") LTO for best size and performance.
lto = true
# Single codegen unit for maximum optimization.
codegen-units = 1
# Abort on panic instead of unwinding, which removes the unwinding code.
panic = "abort"
# Strip all symbols for a smaller binary.
strip = true

[lints.clippy]
# Clippy lints allowed for this package, kept in alphabetical order.
large_enum_variant = "allow"
should_implement_trait = "allow"
too_many_arguments = "allow"
type_complexity = "allow"
upper_case_acronyms = "allow"

[dev-dependencies]
# Only compiled for tests, examples and benchmarks; not part of normal builds.
tempfile = "3"

# Integration test targets gated on the `test-utils` feature: Cargo skips
# them unless it is enabled, e.g. `cargo test --features test-utils`.
# No `path` is set, so Cargo looks for each target in its default location
# under `tests/`.
[[test]]
name = "distributed_integration"
required-features = ["test-utils"]

[[test]]
name = "pipeline_e2e"
required-features = ["test-utils"]