diff --git a/Cargo.lock b/Cargo.lock index 7b574edc..3b74d6a5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -147,6 +147,15 @@ dependencies = [ "syn", ] +[[package]] +name = "atomic-polyfill" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cf2bce30dfe09ef0bfaef228b9d414faaf7e563035494d7fe092dba54b300f4" +dependencies = [ + "critical-section", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -244,15 +253,6 @@ version = "1.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - [[package]] name = "bit-set" version = "0.8.0" @@ -456,6 +456,15 @@ dependencies = [ "cc", ] +[[package]] +name = "cobs" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa961b519f0b462e3a3b4a34b64d119eeaca1d59af726fe450bbba07a9fc0a1" +dependencies = [ + "thiserror", +] + [[package]] name = "codespan-reporting" version = "0.13.1" @@ -594,6 +603,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + [[package]] name = "crossbeam-deque" version = "0.8.6" @@ -876,6 +891,18 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "embedded-io" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef1a6892d9eef45c8fa6b9e0086428a2cca8491aca8f787c534a3d6d0bcb3ced" + +[[package]] +name = "embedded-io" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edd0f118536f44f5ccd48bcb8b111bdc3de888b58c74639dfb034a357d0f206d" + [[package]] name = "encode_unicode" version = "1.0.0" @@ -1198,6 +1225,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "hash32" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" +dependencies = [ + "byteorder", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -1220,6 +1256,20 @@ dependencies = [ "serde_core", ] +[[package]] +name = "heapless" +version = "0.7.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdc6457c0eb62c71aac4bc17216026d8410337c4126773b9c5daba343f17964f" +dependencies = [ + "atomic-polyfill", + "hash32", + "rustc_version", + "serde", + "spin", + "stable_deref_trait", +] + [[package]] name = "heck" version = "0.5.0" @@ -1885,12 +1935,12 @@ dependencies = [ name = "mlxcel-core" version = "0.1.0" dependencies = [ - "bincode", "cmake", "cxx", "cxx-build", "dirs", "libm", + "postcard", "serde", "serde_json", "sha2 0.11.0", @@ -2207,6 +2257,19 @@ version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" +[[package]] +name = "postcard" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6764c3b5dd454e283a30e6dfe78e9b31096d9e32036b5d1eaac7a6119ccb9a24" +dependencies = [ + "cobs", + "embedded-io 0.4.0", + "embedded-io 0.6.1", + "heapless", + "serde", +] + [[package]] name = "potential_utf" version = "0.1.5" @@ -2571,6 +2634,15 @@ version = "2.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.1.4" @@ -2897,6 +2969,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +dependencies = [ + "lock_api", +] + [[package]] name = "spm_precompiled" version = "0.1.4" diff --git a/deny.toml b/deny.toml index 1ff1307d..c01b6f3c 100644 --- a/deny.toml +++ b/deny.toml @@ -18,13 +18,6 @@ no-default-features = false [advisories] db-path = "~/.cargo/advisory-db" ignore = [ - # `bincode 1.3.3` — used by mlxcel-core. The bincode 1.x team has permanently - # ceased development (RUSTSEC-2025-0141). No safe upgrade exists; bincode 2.x - # is a different crate with a breaking API. Migration to an alternative - # (postcard / bitcode / rkyv) is tracked separately and is not a runtime - # security issue — the advisory is "unmaintained", not a vulnerability. - { id = "RUSTSEC-2025-0141", reason = "bincode 1.x permanently unmaintained; migration tracked in a follow-up issue; no runtime vulnerability" }, - # `paste 1.0.15` — transitive via the `tokenizers` crate (HF). The author # archived the project (RUSTSEC-2024-0436). `pastey` is a drop-in fork but # adoption depends on upstream `tokenizers` switching. No runtime vulnerability diff --git a/src/lib/mlxcel-core/Cargo.toml b/src/lib/mlxcel-core/Cargo.toml index 0b816833..3d00107e 100644 --- a/src/lib/mlxcel-core/Cargo.toml +++ b/src/lib/mlxcel-core/Cargo.toml @@ -23,7 +23,7 @@ smallvec = { version = "1.13", features = ["serde"] } sha2 = "0.11" thiserror = "2.0" tokenizers = "0.22.2" -bincode = "1" +postcard = { version = "1", features = ["alloc"] } dirs = "6" libm = "0.2" tracing = "0.1" diff --git a/src/lib/mlxcel-core/src/lang_analyzer/cache.rs b/src/lib/mlxcel-core/src/lang_analyzer/cache.rs index 8221acc5..ba06a7ba 100644 --- a/src/lib/mlxcel-core/src/lang_analyzer/cache.rs +++ b/src/lib/mlxcel-core/src/lang_analyzer/cache.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -//! Disk cache for `TokenLanguageIndex` (B4 — vocab-hash keyed, bincode v1). +//! Disk cache for `TokenLanguageIndex` (B4 — vocab-hash keyed, postcard 1.x). //! //! # Cache key //! `vocab_hash = hex(sha256(tokenizer.json bytes))[..16]` @@ -24,7 +24,7 @@ //! - File missing → build and write. //! - `version` field mismatch → rebuild and overwrite. //! - `--lang-bias-rebuild-cache` / `rebuild: bool` → force rebuild. -//! - Corrupted bincode → rename to `*.broken..bak` then rebuild. +//! - Corrupted postcard data → rename to `*.broken..bak` then rebuild. use std::path::PathBuf; @@ -69,7 +69,7 @@ pub fn cache_path(vocab_hash: &str) -> PathBuf { /// On a version mismatch the corrupted/stale file is left in place (the /// caller will overwrite it via [`save`]). /// -/// On a **bincode decode failure** the corrupt file is renamed to +/// On a **postcard decode failure** the corrupt file is renamed to /// `.broken..bak` before returning `None`, so the /// caller can build fresh without worrying about re-encountering the same /// corrupt bytes. @@ -77,7 +77,7 @@ pub fn try_load(vocab_hash: &str) -> Option { let path = cache_path(vocab_hash); let bytes = std::fs::read(&path).ok()?; - match bincode::deserialize::(&bytes) { + match postcard::from_bytes::(&bytes) { Ok(idx) if idx.version == CURRENT_VERSION => Some(idx), Ok(_) => { // Version mismatch — stale cache. Leave the file; the caller will @@ -121,7 +121,7 @@ pub fn save(index: &TokenLanguageIndex) -> Result<(), LangAnalyzerError> { if let Some(parent) = path.parent() { std::fs::create_dir_all(parent)?; } - let bytes = bincode::serialize(index)?; + let bytes = postcard::to_allocvec(index)?; // Write to a sibling temp file first to ensure atomicity. let tmp = path.with_extension("bin.tmp"); std::fs::write(&tmp, &bytes)?; @@ -364,7 +364,7 @@ mod tests { std::env::set_var("MLXCEL_CACHE_DIR", tmp.path()); let path = cache_path(hash); std::fs::create_dir_all(path.parent().unwrap()).expect("create dirs"); - std::fs::write(&path, b"not valid bincode data!!!").expect("write garbage"); + std::fs::write(&path, b"not valid postcard data!!!").expect("write garbage"); let result = try_load(hash); let path_still_exists = path.exists(); let cache_dir = path.parent().unwrap().to_path_buf(); diff --git a/src/lib/mlxcel-core/src/lang_analyzer/mod.rs b/src/lib/mlxcel-core/src/lang_analyzer/mod.rs index 4dfdd5aa..21ede340 100644 --- a/src/lib/mlxcel-core/src/lang_analyzer/mod.rs +++ b/src/lib/mlxcel-core/src/lang_analyzer/mod.rs @@ -22,7 +22,7 @@ //! The module is structured in sub-issues: //! - **B2 (this file, initial)**: `Script` enum, `classify_token`, helper predicates. //! - **B3** (added in the same file): `TokenScriptInfo`, `TokenLanguageIndex`, `build()`. -//! - **B4** (`cache` submodule): disk cache for `TokenLanguageIndex` (vocab-hash keyed, bincode v1). +//! - **B4** (`cache` submodule): disk cache for `TokenLanguageIndex` (vocab-hash keyed, postcard 1.x). pub mod cache; pub use cache::{cache_path, load_or_build, save, try_load}; @@ -284,8 +284,8 @@ pub enum LangAnalyzerError { Io(#[from] std::io::Error), #[error("tokenizer.json not found at path: {0}")] TokenizerJsonNotFound(String), - #[error("bincode serialization error: {0}")] - Bincode(#[from] bincode::Error), + #[error("postcard serialization error: {0}")] + Postcard(#[from] postcard::Error), #[error("unknown language code '{0}'; expected one of: ja zh ko en ru ar th hi he el")] UnknownLanguageCode(String), }