[Bugfix] CosyVoice3: wrap ref_text in instruction template (#4644) (#4756)

linyueqian · web-flow · commit 92f72085597b · 2026-06-27T14:38:33.000-07:00
diff --git a/tests/model_executor/stage_input_processors/test_cosyvoice3_stage_input_processors.py b/tests/model_executor/stage_input_processors/test_cosyvoice3_stage_input_processors.py
@@ -91,7 +91,11 @@ def test_text2flow_token_only_strips_reference_speech_prefix_from_cumulative_ids
     assert outputs[0]["additional_information"]["ids"]["prompt"] == [10, 11]
 
 
-def test_text2flow_token_only_marks_prompt_trim_for_stop_token_completion():
+def test_text2flow_token_only_does_not_mark_prompt_trim():
+    # The talker prompt is wrapped with the CosyVoice3 instruction template in
+    # _build_cosyvoice3_prompt, so the talker emits target-only speech and no
+    # prompt-trim offset is required; the flow stage trims prompt_feat itself
+    # (issue #4644). Confirm no talker_prefill_offset is set.
     source_outputs = [
         _source_output(
             "req-stop",
@@ -104,7 +108,8 @@ def test_text2flow_token_only_marks_prompt_trim_for_stop_token_completion():
     outputs = text2flow_token_only(source_outputs=source_outputs, prompt=None)
 
     assert outputs[0]["prompt_token_ids"] == [1, 2, 6562]
-    assert outputs[0]["additional_information"]["meta"]["talker_prefill_offset"] == 2
+    meta = outputs[0]["additional_information"].get("meta") or {}
+    assert "talker_prefill_offset" not in meta
 
 
 def test_text2flow_full_payload_does_not_send_codec_ids():
diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py
@@ -82,6 +82,11 @@
 _QWEN3_TTS_MODEL_STAGES = {"qwen3_tts"}
 _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"}
 _COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"}
+# CosyVoice3 talker expects its reference transcript wrapped in the model
+# instruction template; without the delimiter the talker re-speaks the
+# reference (issue #4644). Matches the offline example/test and upstream demo.
+_COSYVOICE3_PROMPT_DELIMITER = "<|endofprompt|>"
+_COSYVOICE3_PROMPT_PREFIX = f"You are a helpful assistant.{_COSYVOICE3_PROMPT_DELIMITER}"
 _OMNIVOICE_TTS_MODEL_STAGES = {"omnivoice_generator"}
 _COVO_AUDIO_MODEL_STAGES = {"fused_thinker_talker"}
 _VOXCPM2_TTS_MODEL_STAGES = {"latent_generator"}
@@ -3208,8 +3213,14 @@ async def _build_cosyvoice3_prompt(
         wav_samples, sr = await self._resolve_ref_audio(request.ref_audio)
         audio_data = (np.asarray(wav_samples, dtype=np.float32), sr)
 
+        # Wrap the reference transcript in the CosyVoice3 instruction template
+        # so the talker emits target-only speech (see _COSYVOICE3_PROMPT_PREFIX).
+        # Skip wrapping when ref_text already contains the prompt delimiter.
+        ref_text = request.ref_text or ""
+        if _COSYVOICE3_PROMPT_DELIMITER not in ref_text:
+            ref_text = f"{_COSYVOICE3_PROMPT_PREFIX}{ref_text}"
         mm_kwargs: dict[str, Any] = {
-            "prompt_text": request.ref_text,
+            "prompt_text": ref_text,
             "sample_rate": sr,
         }
         # Pass voice metadata for caching in the processor
diff --git a/vllm_omni/model_executor/stage_input_processors/cosyvoice3.py b/vllm_omni/model_executor/stage_input_processors/cosyvoice3.py
@@ -21,8 +21,6 @@
 
 logger = init_logger(__name__)
 
-_COSYVOICE3_SPEECH_TOKEN_SIZE = 6561
-
 
 def _build_prompt_embed_struct(prompt_payload: dict[str, Any]) -> EmbeddingsStruct | None:
     """Wrap prompt_payload's flat speech_token/speech_feat/embedding tensors into EmbeddingsStruct."""
@@ -85,20 +83,6 @@ def _prompt_speech_token_ids(multi_modal_data: dict[str, Any]) -> list[int]:
     return _to_token_id_list(speech_token)
 
 
-def _has_speech_stop_token(output_ids: list[Any]) -> bool:
-    return any(token_id >= _COSYVOICE3_SPEECH_TOKEN_SIZE for token_id in _to_token_id_list(output_ids))
-
-
-def _set_non_stream_prompt_trim(additional_info: dict[str, Any], prompt_speech_len: int) -> None:
-    if prompt_speech_len <= 0:
-        return
-    meta = additional_info.get("meta")
-    if not isinstance(meta, dict):
-        meta = {}
-        additional_info["meta"] = meta
-    meta["talker_prefill_offset"] = prompt_speech_len
-
-
 def _to_cpu_tensor(x: Any) -> torch.Tensor | None:
     if isinstance(x, list):
         if not x:
@@ -154,8 +138,6 @@ def text2flow(
         output_ids = _strip_prompt_prefix(raw_output_ids, prefix_ids)
         output_ids = _strip_prompt_prefix(output_ids, prompt_speech_ids)
         additional_info = dict(multi_modal_data)
-        if _has_speech_stop_token(raw_output_ids):
-            _set_non_stream_prompt_trim(additional_info, len(prompt_speech_ids))
         additional_info.setdefault("ids", {})["prompt"] = prefix_ids
         engine_inputs.append(OmniTokensPrompt(prompt_token_ids=output_ids, additional_information=additional_info))
     return engine_inputs
@@ -389,8 +371,6 @@ def text2flow_token_only(
         prompt_speech_ids = _prompt_speech_token_ids(multi_modal_data)
         output_ids = _strip_prompt_prefix(output_ids, prompt_speech_ids)
         additional_info: dict[str, Any] = dict(multi_modal_data)
-        if _has_speech_stop_token(raw_output_ids):
-            _set_non_stream_prompt_trim(additional_info, len(prompt_speech_ids))
         additional_info.setdefault("ids", {})["prompt"] = prefix_ids
         engine_inputs.append(
             OmniTokensPrompt(