Skip to content

Commit 183c758

Browse files
committed
Use tokenizer.bos_token and tokenizer.eos_token for output formatting
1 parent 52f565b commit 183c758

File tree

1 file changed

+6
-3
lines changed

1 file changed

+6
-3
lines changed

torchtitan/experiments/deepseek_v3/generate.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,13 @@ def generate(mesh: DeviceMesh, messages: list[dict], n_tokens: int = 50):
123123
if rank == 0:
124124
output = tokenizer.decode(x[0])
125125
# Clean up the output by removing special tokens
126-
output = output.replace("<|begin▁of▁sentence|>", "")
126+
bos = tokenizer.bos_token
127+
output = output.replace(bos, "")
127128
# Truncate at end of sentence token (might not be correct termination)
128-
if "<|end▁of▁sentence|>" in output:
129-
output = output.split("<|end▁of▁sentence|>")[0]
129+
# Use tokenizer's EOS token for more portable code
130+
eos_token = tokenizer.eos_token
131+
if eos_token and eos_token in output:
132+
output = output.split(eos_token)[0]
130133

131134
print(f"Output: {output}")
132135

0 commit comments

Comments
 (0)