Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
101 changes: 91 additions & 10 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 0 additions & 7 deletions deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,6 @@ no-default-features = false
[advisories]
db-path = "~/.cargo/advisory-db"
ignore = [
# `bincode 1.3.3` — used by mlxcel-core. The bincode 1.x team has permanently
# ceased development (RUSTSEC-2025-0141). No safe upgrade exists; bincode 2.x
# is a different crate with a breaking API. Migration to an alternative
# (postcard / bitcode / rkyv) is tracked separately and is not a runtime
# security issue — the advisory is "unmaintained", not a vulnerability.
{ id = "RUSTSEC-2025-0141", reason = "bincode 1.x permanently unmaintained; migration tracked in a follow-up issue; no runtime vulnerability" },

# `paste 1.0.15` — transitive via the `tokenizers` crate (HF). The author
# archived the project (RUSTSEC-2024-0436). `pastey` is a drop-in fork but
# adoption depends on upstream `tokenizers` switching. No runtime vulnerability
Expand Down
2 changes: 1 addition & 1 deletion src/lib/mlxcel-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ smallvec = { version = "1.13", features = ["serde"] }
sha2 = "0.11"
thiserror = "2.0"
tokenizers = "0.22.2"
bincode = "1"
postcard = { version = "1", features = ["alloc"] }
dirs = "6"
libm = "0.2"
tracing = "0.1"
Expand Down
12 changes: 6 additions & 6 deletions src/lib/mlxcel-core/src/lang_analyzer/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

//! Disk cache for `TokenLanguageIndex` (B4 — vocab-hash keyed, bincode v1).
//! Disk cache for `TokenLanguageIndex` (B4 — vocab-hash keyed, postcard 1.x).
//!
//! # Cache key
//! `vocab_hash = hex(sha256(tokenizer.json bytes))[..16]`
Expand All @@ -24,7 +24,7 @@
//! - File missing → build and write.
//! - `version` field mismatch → rebuild and overwrite.
//! - `--lang-bias-rebuild-cache` / `rebuild: bool` → force rebuild.
//! - Corrupted bincode → rename to `*.broken.<epoch>.bak` then rebuild.
//! - Corrupted postcard data → rename to `*.broken.<epoch>.bak` then rebuild.

use std::path::PathBuf;

Expand Down Expand Up @@ -69,15 +69,15 @@ pub fn cache_path(vocab_hash: &str) -> PathBuf {
/// On a version mismatch the stale (but otherwise decodable) file is left in
/// place (the caller will overwrite it via [`save`]).
///
/// On a **bincode decode failure** the corrupt file is renamed to
/// On a **postcard decode failure** the corrupt file is renamed to
/// `<original>.broken.<epoch_secs>.bak` before returning `None`, so the
/// caller can build fresh without worrying about re-encountering the same
/// corrupt bytes.
pub fn try_load(vocab_hash: &str) -> Option<TokenLanguageIndex> {
let path = cache_path(vocab_hash);
let bytes = std::fs::read(&path).ok()?;

match bincode::deserialize::<TokenLanguageIndex>(&bytes) {
match postcard::from_bytes::<TokenLanguageIndex>(&bytes) {
Ok(idx) if idx.version == CURRENT_VERSION => Some(idx),
Ok(_) => {
// Version mismatch — stale cache. Leave the file; the caller will
Expand Down Expand Up @@ -121,7 +121,7 @@ pub fn save(index: &TokenLanguageIndex) -> Result<(), LangAnalyzerError> {
if let Some(parent) = path.parent() {
std::fs::create_dir_all(parent)?;
}
let bytes = bincode::serialize(index)?;
let bytes = postcard::to_allocvec(index)?;
// Write to a sibling temp file first to ensure atomicity.
let tmp = path.with_extension("bin.tmp");
std::fs::write(&tmp, &bytes)?;
Expand Down Expand Up @@ -364,7 +364,7 @@ mod tests {
std::env::set_var("MLXCEL_CACHE_DIR", tmp.path());
let path = cache_path(hash);
std::fs::create_dir_all(path.parent().unwrap()).expect("create dirs");
std::fs::write(&path, b"not valid bincode data!!!").expect("write garbage");
std::fs::write(&path, b"not valid postcard data!!!").expect("write garbage");
let result = try_load(hash);
let path_still_exists = path.exists();
let cache_dir = path.parent().unwrap().to_path_buf();
Expand Down
6 changes: 3 additions & 3 deletions src/lib/mlxcel-core/src/lang_analyzer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
//! The module is structured in sub-issues:
//! - **B2 (this file, initial)**: `Script` enum, `classify_token`, helper predicates.
//! - **B3** (added in the same file): `TokenScriptInfo`, `TokenLanguageIndex`, `build()`.
//! - **B4** (`cache` submodule): disk cache for `TokenLanguageIndex` (vocab-hash keyed, bincode v1).
//! - **B4** (`cache` submodule): disk cache for `TokenLanguageIndex` (vocab-hash keyed, postcard 1.x).

pub mod cache;
pub use cache::{cache_path, load_or_build, save, try_load};
Expand Down Expand Up @@ -284,8 +284,8 @@ pub enum LangAnalyzerError {
Io(#[from] std::io::Error),
#[error("tokenizer.json not found at path: {0}")]
TokenizerJsonNotFound(String),
#[error("bincode serialization error: {0}")]
Bincode(#[from] bincode::Error),
#[error("postcard serialization error: {0}")]
Postcard(#[from] postcard::Error),
#[error("unknown language code '{0}'; expected one of: ja zh ko en ru ar th hi he el")]
UnknownLanguageCode(String),
}
Expand Down
Loading