Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions crates/tokenizer/src/encoders/deepseek_v32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,11 +141,11 @@ fn user_msg(content: &str) -> String {
// JSON helpers
// ---------------------------------------------------------------------------

/// Mirrors the Python `to_json` helper. serde_json always emits valid UTF-8
/// without escaping, so the `ensure_ascii` fallback in the Python version is
/// effectively a no-op here.
/// Mirrors the Python `to_json` helper: `json.dumps(value, ensure_ascii=False)`,
/// which uses spaced `", "` / `": "` separators. Compact `serde_json::to_string`
/// output would differ byte-for-byte from the prompt vLLM renders.
fn to_json(value: &Value) -> String {
serde_json::to_string(value).unwrap_or_else(|_| "null".to_string())
crate::json_dumps::to_string(value)
}

/// `[tool["function"] for tool in tools]`
Expand Down
5 changes: 4 additions & 1 deletion crates/tokenizer/src/encoders/deepseek_v4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,11 @@ You MUST strictly follow the above defined tool name and parameter schemas to in
// ---------------------------------------------------------------------------
// JSON helpers (mirror V3.2)
// ---------------------------------------------------------------------------
// Python's `to_json` is `json.dumps(value, ensure_ascii=False)`: spaced
// separators, raw UTF-8. Compact `serde_json::to_string` would change the
// prompt bytes relative to vLLM's reference encoder, which the model was trained on.
fn to_json(value: &Value) -> String {
serde_json::to_string(value).unwrap_or_else(|_| "null".to_string())
crate::json_dumps::to_string(value)
}
fn tools_from_openai_format(tools: &[Value]) -> Vec<Value> {
tools
Expand Down
104 changes: 104 additions & 0 deletions crates/tokenizer/src/json_dumps.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
//! Serialize a [`serde_json::Value`] like Python's `json.dumps(value, ensure_ascii=False)`:
//! spaced `", "` / `": "` separators, single line, raw UTF-8.
//!
//! serde_json only ships compact (`{"a":1}`) and pretty (multi-line) formatters,
//! neither of which matches `json.dumps`. The DeepSeek V3.2/V4 prompt encoders
//! embed tool schemas and argument values as JSON, and vLLM's reference encoder
//! uses `json.dumps`, so compact output would shift the model off its training
//! distribution. This adds just the spacing serde_json lacks.

use std::io;

use serde::Serialize;
use serde_json::{
ser::{Formatter, Serializer},
Value,
};

/// `serde_json` formatter that reproduces the default output style of Python's
/// `json.dumps`: `", "` between items, `": "` between a key and its value, and
/// floats rendered the way Python's `float.__repr__` renders them.
///
/// Every other formatting hook keeps the `serde_json` default.
struct PythonDefaultFormatter;

impl Formatter for PythonDefaultFormatter {
    /// Writes the key/value separator `": "`.
    fn begin_object_value<W: ?Sized + io::Write>(&mut self, w: &mut W) -> io::Result<()> {
        w.write_all(b": ")
    }

    /// Writes `", "` before every object key except the first.
    fn begin_object_key<W: ?Sized + io::Write>(
        &mut self,
        w: &mut W,
        first: bool,
    ) -> io::Result<()> {
        if first {
            Ok(())
        } else {
            w.write_all(b", ")
        }
    }

    /// Writes `", "` before every array element except the first.
    fn begin_array_value<W: ?Sized + io::Write>(
        &mut self,
        w: &mut W,
        first: bool,
    ) -> io::Result<()> {
        if first {
            Ok(())
        } else {
            w.write_all(b", ")
        }
    }

    /// Writes a finite float using Python's `repr` layout instead of the
    /// `serde_json` default, which differs for very large and very small
    /// magnitudes (`1e16` vs. Python's `1e+16`, `0.00001` vs. `1e-05`).
    fn write_f64<W: ?Sized + io::Write>(&mut self, w: &mut W, value: f64) -> io::Result<()> {
        w.write_all(python_float_repr(value).as_bytes())
    }
}

/// Renders a finite `value` the way Python's `repr(float)` does.
///
/// Falls back to Rust's own scientific rendering if that rendering cannot be
/// parsed, which is not expected for finite inputs.
fn python_float_repr(value: f64) -> String {
    // `{:e}` without a precision yields the shortest digits that round-trip.
    let scientific = format!("{:e}", value);
    match python_repr_from_scientific(&scientific) {
        Some(rendered) => rendered,
        None => scientific,
    }
}

/// Re-lays-out a Rust scientific rendering (`-d.ddde-N`) using Python's rules.
///
/// With `decpt` the position of the decimal point relative to the first
/// significant digit (`exponent + 1`), Python uses exponent notation when
/// `decpt <= -4` or `decpt > 16`, writing the exponent with an explicit sign
/// and at least two digits; otherwise it writes plain decimal notation that
/// always contains a fractional part.
///
/// Returns `None` when `scientific` does not have the expected shape.
fn python_repr_from_scientific(scientific: &str) -> Option<String> {
    let (mantissa, exponent) = scientific.split_once('e')?;
    let exponent: i32 = exponent.parse().ok()?;
    let (sign, mantissa) = match mantissa.strip_prefix('-') {
        Some(rest) => ("-", rest),
        None => ("", mantissa),
    };
    let digits: String = mantissa.chars().filter(|c| *c != '.').collect();
    if digits.is_empty() || !digits.bytes().all(|b| b.is_ascii_digit()) {
        return None;
    }

    let decpt = exponent + 1;
    let mut out = String::with_capacity(digits.len() + 8);
    out.push_str(sign);

    if decpt <= -4 || decpt > 16 {
        // Exponent notation: `d[.ddd]e±XX`.
        let (head, tail) = digits.split_at(1);
        out.push_str(head);
        if !tail.is_empty() {
            out.push('.');
            out.push_str(tail);
        }
        out.push('e');
        out.push(if exponent < 0 { '-' } else { '+' });
        out.push_str(&format!("{:02}", exponent.unsigned_abs()));
    } else if decpt <= 0 {
        // Pure fraction: `0.000ddd`; `decpt` is in -3..=0 here.
        out.push_str("0.");
        out.push_str(&"0".repeat(decpt.unsigned_abs() as usize));
        out.push_str(&digits);
    } else {
        // `decpt` is in 1..=16 here, so the cast cannot truncate.
        let int_len = decpt as usize;
        if digits.len() <= int_len {
            // Integral value: pad with zeros and keep a trailing `.0`.
            out.push_str(&digits);
            out.push_str(&"0".repeat(int_len - digits.len()));
            out.push_str(".0");
        } else {
            let (int_part, frac_part) = digits.split_at(int_len);
            out.push_str(int_part);
            out.push('.');
            out.push_str(frac_part);
        }
    }
    Some(out)
}

/// Renders `value` as single-line JSON through [`PythonDefaultFormatter`],
/// aiming to match `json.dumps(value, ensure_ascii=False)`.
///
/// Returns the literal string `"null"` if serialization reports an error or
/// the produced bytes are not valid UTF-8.
pub(crate) fn to_string(value: &Value) -> String {
    let mut out: Vec<u8> = Vec::new();
    let outcome = {
        let mut serializer = Serializer::with_formatter(&mut out, PythonDefaultFormatter);
        value.serialize(&mut serializer)
    };
    match outcome {
        Ok(()) => String::from_utf8(out).unwrap_or_else(|_| "null".to_string()),
        Err(_) => "null".to_string(),
    }
}

#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    /// Structural separators are spaced and non-ASCII text is emitted raw, as
    /// `json.dumps({...}, ensure_ascii=False)` does.
    #[test]
    fn matches_python_json_dumps() {
        let value = json!({"name": "get_weather", "args": [1, 2, {"a": true}], "city": "广州"});
        let expected = r#"{"name": "get_weather", "args": [1, 2, {"a": true}], "city": "广州"}"#;
        assert_eq!(to_string(&value), expected);
    }

    /// Empty objects and arrays get no interior spacing.
    #[test]
    fn empty_containers() {
        let cases = [(json!({}), "{}"), (json!([]), "[]")];
        for (value, expected) in cases.iter() {
            assert_eq!(to_string(value), *expected);
        }
    }

    /// Only structural `,` / `:` are spaced; the same characters inside keys
    /// and string values pass through unchanged.
    #[test]
    fn separators_inside_content_are_left_untouched() {
        let cases = [
            // Separator characters inside a value, then inside a key.
            (json!({"a": "x, y: z"}), r#"{"a": "x, y: z"}"#),
            (json!({"k:1": "v,2"}), r#"{"k:1": "v,2"}"#),
            // Values that are exactly the separator strings.
            (
                json!({"sep": ", ", "kv": ": "}),
                r#"{"sep": ", ", "kv": ": "}"#,
            ),
            // A string value that itself looks like JSON stays escaped and unspaced.
            (
                json!({"s": "{\"inner\": 1, \"b\": [2, 3]}"}),
                r#"{"s": "{\"inner\": 1, \"b\": [2, 3]}"}"#,
            ),
            // The same holds for array elements.
            (
                json!(["a,b", "c:d", "e, f: g"]),
                r#"["a,b", "c:d", "e, f: g"]"#,
            ),
        ];
        for (value, expected) in cases.iter() {
            assert_eq!(to_string(value), *expected);
        }
    }
}
1 change: 1 addition & 0 deletions crates/tokenizer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pub mod encoders;
pub mod eos;
pub mod factory;
pub mod hub;
pub(crate) mod json_dumps;
pub mod mock;
pub mod registry;
pub mod sequence;
Expand Down
Loading