Skip to content

Commit a84284e

Browse files
authored
refactor(core): migrate lang_analyzer cache from bincode to postcard (#16)
Closes #8. Migrates `mlxcel-core`'s `TokenLanguageIndex` on-disk cache off the unmaintained `bincode` 1.x (RUSTSEC-2025-0141 — project permanently ceased development) to `postcard` 1.x.

## Architectural choice — postcard

- Drop-in serde-based API (`postcard::to_allocvec(&v)?` / `postcard::from_bytes::<T>(&bytes)?`) — minimal call-site change.
- 1.0+ stable with frozen wire format — protects production users' cache compatibility across future dependency upgrades.
- Embedded Rust ecosystem standard — natural fit for an inference-runtime cache.

Alternatives considered: `bitcode` (faster but pre-1.0, wire-format changes between minors), `rkyv` (zero-copy over-engineering for a startup-time cache), `bincode` 2.x / 3.x (different maintainer team, name-confusion risk — Dependabot's auto-PR #10 to bincode 3.x was explicitly closed in favor of this approach).

## Cache compatibility — graceful degradation

`lang_analyzer/cache.rs` already has corrupt-detection: a deserialization failure renames the existing bincode-format file to `*.broken.<epoch>.bak` and rebuilds the cache from source. Users upgrading from a prior mlxcel release will trip this path automatically on the first cache read — no explicit migration code needed.

## Scope (4 files + Cargo.lock)

- `src/lib/mlxcel-core/Cargo.toml` — `bincode = "1"` → `postcard = { version = "1", features = ["alloc"] }`
- `src/lib/mlxcel-core/src/lang_analyzer/cache.rs` — 2 call sites, 1 test fixture string + doc comments
- `src/lib/mlxcel-core/src/lang_analyzer/mod.rs` — error variant `Bincode` → `Postcard` (no external match-arms on it, safe rename)
- `deny.toml` — RUSTSEC-2025-0141 `[advisories.ignore]` entry removed (no longer applicable)

Verified locally: `cargo deny check` clean (advisories ok, bans ok, licenses ok, sources ok); `cargo tree -p mlxcel-core -i bincode` returns "package not found" — bincode is fully removed from the dependency tree.
Self-hosted clippy + test gate is queued behind the PR #14 cold-build backlog at merge time; will run on main after the runner clears.
1 parent db35c68 commit a84284e

5 files changed

Lines changed: 101 additions & 27 deletions

File tree

Cargo.lock

Lines changed: 91 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deny.toml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,6 @@ no-default-features = false
1818
[advisories]
1919
db-path = "~/.cargo/advisory-db"
2020
ignore = [
21-
# `bincode 1.3.3` — used by mlxcel-core. The bincode 1.x team has permanently
22-
# ceased development (RUSTSEC-2025-0141). No safe upgrade exists; bincode 2.x
23-
# is a different crate with a breaking API. Migration to an alternative
24-
# (postcard / bitcode / rkyv) is tracked separately and is not a runtime
25-
# security issue — the advisory is "unmaintained", not a vulnerability.
26-
{ id = "RUSTSEC-2025-0141", reason = "bincode 1.x permanently unmaintained; migration tracked in a follow-up issue; no runtime vulnerability" },
27-
2821
# `paste 1.0.15` — transitive via the `tokenizers` crate (HF). The author
2922
# archived the project (RUSTSEC-2024-0436). `pastey` is a drop-in fork but
3023
# adoption depends on upstream `tokenizers` switching. No runtime vulnerability

src/lib/mlxcel-core/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ smallvec = { version = "1.13", features = ["serde"] }
2323
sha2 = "0.11"
2424
thiserror = "2.0"
2525
tokenizers = "0.22.2"
26-
bincode = "1"
26+
postcard = { version = "1", features = ["alloc"] }
2727
dirs = "6"
2828
libm = "0.2"
2929
tracing = "0.1"

src/lib/mlxcel-core/src/lang_analyzer/cache.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
//! Disk cache for `TokenLanguageIndex` (B4 — vocab-hash keyed, bincode v1).
15+
//! Disk cache for `TokenLanguageIndex` (B4 — vocab-hash keyed, postcard 1.x).
1616
//!
1717
//! # Cache key
1818
//! `vocab_hash = hex(sha256(tokenizer.json bytes))[..16]`
@@ -24,7 +24,7 @@
2424
//! - File missing → build and write.
2525
//! - `version` field mismatch → rebuild and overwrite.
2626
//! - `--lang-bias-rebuild-cache` / `rebuild: bool` → force rebuild.
27-
//! - Corrupted bincode → rename to `*.broken.<epoch>.bak` then rebuild.
27+
//! - Corrupted postcard data → rename to `*.broken.<epoch>.bak` then rebuild.
2828
2929
use std::path::PathBuf;
3030

@@ -69,15 +69,15 @@ pub fn cache_path(vocab_hash: &str) -> PathBuf {
6969
/// On a version mismatch the stale file is left in place (the
7070
/// caller will overwrite it via [`save`]).
7171
///
72-
/// On a **bincode decode failure** the corrupt file is renamed to
72+
/// On a **postcard decode failure** the corrupt file is renamed to
7373
/// `<original>.broken.<epoch_secs>.bak` before returning `None`, so the
7474
/// caller can build fresh without worrying about re-encountering the same
7575
/// corrupt bytes.
7676
pub fn try_load(vocab_hash: &str) -> Option<TokenLanguageIndex> {
7777
let path = cache_path(vocab_hash);
7878
let bytes = std::fs::read(&path).ok()?;
7979

80-
match bincode::deserialize::<TokenLanguageIndex>(&bytes) {
80+
match postcard::from_bytes::<TokenLanguageIndex>(&bytes) {
8181
Ok(idx) if idx.version == CURRENT_VERSION => Some(idx),
8282
Ok(_) => {
8383
// Version mismatch — stale cache. Leave the file; the caller will
@@ -121,7 +121,7 @@ pub fn save(index: &TokenLanguageIndex) -> Result<(), LangAnalyzerError> {
121121
if let Some(parent) = path.parent() {
122122
std::fs::create_dir_all(parent)?;
123123
}
124-
let bytes = bincode::serialize(index)?;
124+
let bytes = postcard::to_allocvec(index)?;
125125
// Write to a sibling temp file first to ensure atomicity.
126126
let tmp = path.with_extension("bin.tmp");
127127
std::fs::write(&tmp, &bytes)?;
@@ -364,7 +364,7 @@ mod tests {
364364
std::env::set_var("MLXCEL_CACHE_DIR", tmp.path());
365365
let path = cache_path(hash);
366366
std::fs::create_dir_all(path.parent().unwrap()).expect("create dirs");
367-
std::fs::write(&path, b"not valid bincode data!!!").expect("write garbage");
367+
std::fs::write(&path, b"not valid postcard data!!!").expect("write garbage");
368368
let result = try_load(hash);
369369
let path_still_exists = path.exists();
370370
let cache_dir = path.parent().unwrap().to_path_buf();

src/lib/mlxcel-core/src/lang_analyzer/mod.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
//! The module is structured in sub-issues:
2323
//! - **B2 (this file, initial)**: `Script` enum, `classify_token`, helper predicates.
2424
//! - **B3** (added in the same file): `TokenScriptInfo`, `TokenLanguageIndex`, `build()`.
25-
//! - **B4** (`cache` submodule): disk cache for `TokenLanguageIndex` (vocab-hash keyed, bincode v1).
25+
//! - **B4** (`cache` submodule): disk cache for `TokenLanguageIndex` (vocab-hash keyed, postcard 1.x).
2626
2727
pub mod cache;
2828
pub use cache::{cache_path, load_or_build, save, try_load};
@@ -284,8 +284,8 @@ pub enum LangAnalyzerError {
284284
Io(#[from] std::io::Error),
285285
#[error("tokenizer.json not found at path: {0}")]
286286
TokenizerJsonNotFound(String),
287-
#[error("bincode serialization error: {0}")]
288-
Bincode(#[from] bincode::Error),
287+
#[error("postcard serialization error: {0}")]
288+
Postcard(#[from] postcard::Error),
289289
#[error("unknown language code '{0}'; expected one of: ja zh ko en ru ar th hi he el")]
290290
UnknownLanguageCode(String),
291291
}

0 commit comments

Comments
 (0)