Skip to content

Commit 78389d5

Browse files
authored
fix(tokenizer): serialize DeepSeek V3.2/V4 prompt JSON with json.dumps spacing (#1839)
Signed-off-by: key4ng <rukeyang@gmail.com>
1 parent 4f1616d commit 78389d5

4 files changed

Lines changed: 113 additions & 5 deletions

File tree

crates/tokenizer/src/encoders/deepseek_v32.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,11 @@ fn user_msg(content: &str) -> String {
141141
// JSON helpers
142142
// ---------------------------------------------------------------------------
143143

144-
/// Mirrors the Python `to_json` helper. serde_json always emits valid UTF-8
145-
/// without escaping, so the `ensure_ascii` fallback in the Python version is
146-
/// effectively a no-op here.
144+
/// Mirrors the Python `to_json` helper: `json.dumps(value, ensure_ascii=False)`,
145+
/// which uses spaced `", "` / `": "` separators. Compact `serde_json::to_string`
146+
/// would diverge from the prompt bytes vLLM renders.
147147
fn to_json(value: &Value) -> String {
148-
serde_json::to_string(value).unwrap_or_else(|_| "null".to_string())
148+
crate::json_dumps::to_string(value)
149149
}
150150

151151
/// `[tool["function"] for tool in tools]`

crates/tokenizer/src/encoders/deepseek_v4.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,8 +137,11 @@ You MUST strictly follow the above defined tool name and parameter schemas to in
137137
// ---------------------------------------------------------------------------
138138
// JSON helpers (mirror V3.2)
139139
// ---------------------------------------------------------------------------
140+
// Python's `to_json` is `json.dumps(value, ensure_ascii=False)`: spaced
141+
// separators, raw UTF-8. Compact `serde_json::to_string` would change the
142+
// prompt bytes the model was trained on.
140143
fn to_json(value: &Value) -> String {
141-
serde_json::to_string(value).unwrap_or_else(|_| "null".to_string())
144+
crate::json_dumps::to_string(value)
142145
}
143146
fn tools_from_openai_format(tools: &[Value]) -> Vec<Value> {
144147
tools

crates/tokenizer/src/json_dumps.rs

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
//! Serialize a [`serde_json::Value`] like Python's `json.dumps(value, ensure_ascii=False)`:
2+
//! spaced `", "` / `": "` separators, single line, raw UTF-8.
3+
//!
4+
//! serde_json only ships compact (`{"a":1}`) and pretty (multi-line) formatters,
5+
//! neither of which matches `json.dumps`. The DeepSeek V3.2/V4 prompt encoders
6+
//! embed tool schemas and argument values as JSON, and vLLM's reference encoder
7+
//! uses `json.dumps`, so compact output would shift the model off its training
8+
//! distribution. This adds just the spacing serde_json lacks.
9+
10+
use std::io;
11+
12+
use serde::Serialize;
13+
use serde_json::{
14+
ser::{Formatter, Serializer},
15+
Value,
16+
};
17+
18+
/// `serde_json` formatter that adds Python `json.dumps` default separator spacing.
19+
struct PythonDefaultFormatter;
20+
21+
impl Formatter for PythonDefaultFormatter {
22+
fn begin_object_value<W: ?Sized + io::Write>(&mut self, w: &mut W) -> io::Result<()> {
23+
w.write_all(b": ")
24+
}
25+
26+
fn begin_object_key<W: ?Sized + io::Write>(
27+
&mut self,
28+
w: &mut W,
29+
first: bool,
30+
) -> io::Result<()> {
31+
if first {
32+
Ok(())
33+
} else {
34+
w.write_all(b", ")
35+
}
36+
}
37+
38+
fn begin_array_value<W: ?Sized + io::Write>(
39+
&mut self,
40+
w: &mut W,
41+
first: bool,
42+
) -> io::Result<()> {
43+
if first {
44+
Ok(())
45+
} else {
46+
w.write_all(b", ")
47+
}
48+
}
49+
}
50+
51+
/// Serialize `value` like `json.dumps(value, ensure_ascii=False)`.
52+
pub(crate) fn to_string(value: &Value) -> String {
53+
let mut buf = Vec::new();
54+
let mut ser = Serializer::with_formatter(&mut buf, PythonDefaultFormatter);
55+
if value.serialize(&mut ser).is_err() {
56+
return "null".to_string();
57+
}
58+
String::from_utf8(buf).unwrap_or_else(|_| "null".to_string())
59+
}
60+
61+
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    /// Checks every `(input, expected)` pair against [`to_string`].
    fn assert_all(cases: &[(Value, &str)]) {
        for (input, expected) in cases {
            assert_eq!(to_string(input), *expected);
        }
    }

    #[test]
    fn matches_python_json_dumps() {
        // Expected text is what json.dumps({...}, ensure_ascii=False) prints:
        // spaced separators and unescaped non-ASCII characters.
        let input = json!({"name": "get_weather", "args": [1, 2, {"a": true}], "city": "广州"});
        let expected = r#"{"name": "get_weather", "args": [1, 2, {"a": true}], "city": "广州"}"#;
        assert_all(&[(input, expected)]);
    }

    #[test]
    fn empty_containers() {
        assert_all(&[(json!({}), "{}"), (json!([]), "[]")]);
    }

    #[test]
    fn separators_inside_content_are_left_untouched() {
        // Only structural separators get a trailing space; `,` and `:` that
        // appear inside keys or string values must come through unchanged.
        // Each expectation matches json.dumps(ensure_ascii=False).
        assert_all(&[
            (json!({"a": "x, y: z"}), r#"{"a": "x, y: z"}"#),
            (json!({"k:1": "v,2"}), r#"{"k:1": "v,2"}"#),
            // Values that are themselves separator strings.
            (
                json!({"sep": ", ", "kv": ": "}),
                r#"{"sep": ", ", "kv": ": "}"#,
            ),
            // A string value that looks like JSON stays escaped and unspaced.
            (
                json!({"s": "{\"inner\": 1, \"b\": [2, 3]}"}),
                r#"{"s": "{\"inner\": 1, \"b\": [2, 3]}"}"#,
            ),
            (
                json!(["a,b", "c:d", "e, f: g"]),
                r#"["a,b", "c:d", "e, f: g"]"#,
            ),
        ]);
    }
}

crates/tokenizer/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ pub mod encoders;
77
pub mod eos;
88
pub mod factory;
99
pub mod hub;
10+
pub(crate) mod json_dumps;
1011
pub mod mock;
1112
pub mod registry;
1213
pub mod sequence;

0 commit comments

Comments
 (0)