fix(livestream): free 路 livestream 走原生视觉，不再绕分析通道 (#1508)

wehos · Hongzhi Wen · claude · web-flow · commit 99f930fc5703 · 2026-05-25T04:43:54.000+08:00
livestream 主播自建 server_prefix 上游同为 Gemini 系，原生视觉与发图协议
（input_image_buffer.append）与 lanlan.app+free 一致。原判定只认 base_url 含
lanlan.app，导致 livestream free（base_url 已派生为自建地址）被当成无原生视觉，
每帧丢进 VISION_MODEL 分析通道转文字，多一次模型调用且画面延迟。

抽出共享谓词 _is_free_proxy（free + (lanlan.app 或 livestream)），统一三处原先
散落的 lanlan.app+free 字面判断：_supports_native_image、stream_image 实时发图、
prompt_ephemeral 主动注入截图。lanlan.tech free（StepFun 上游）行为不变。

Co-authored-by: Hongzhi Wen <cartabio.coder1@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/main_logic/omni_realtime_client.py b/main_logic/omni_realtime_client.py
@@ -416,12 +416,21 @@ def __init__(
             and not bool(livestream_mode)
         )
 
+        # free 经 Gemini 代理（OpenAI-realtime 协议，发图走 input_image_buffer.append、
+        # 服务端 VAD 由代理吞掉）：lanlan.app 海外节点，或 livestream 主播自建 server_prefix。
+        # 二者上游同为 Gemini 系，原生视觉与发图协议一致；lanlan.tech free 上游是
+        # StepFun（无原生视觉，走 VISION_MODEL 分析通道），不在此列。
+        self._is_free_proxy = 'free' in self._model_lower and (
+            'lanlan.app' in (base_url or '')
+            or bool(livestream_mode)
+        )
+
         # Whether this client supports native image input
-        # qwen/glm/gpt/gemini have native vision; lanlan.app replacement server (free, non-mainland) also does
+        # qwen/glm/gpt/gemini have native vision; free Gemini-proxy (lanlan.app / livestream) also does
         self._supports_native_image = (
             any(m in self._model_lower for m in ['qwen', 'glm', 'gpt'])
             or self._is_gemini
-            or ('lanlan.app' in (base_url or '') and 'free' in self._model_lower)
+            or self._is_free_proxy
         )
         self._gemini_client = None  # genai.Client instance
         self._gemini_session = None  # Live session from SDK
@@ -1372,7 +1381,7 @@ async def stream_image(self, image_b64: str) -> None:
                             self._fatal_error_occurred = True
                 return
 
-            if ('lanlan.app' in self.base_url and 'free' in self._model_lower):
+            if self._is_free_proxy:
                 append_event = {
                     "type": "input_image_buffer.append" ,
                     "image": image_b64
@@ -1787,7 +1796,7 @@ async def prompt_ephemeral(
                                 }],
                             },
                         })
-                    elif "qwen" in self._model_lower or ("lanlan.app" in self.base_url and "free" in self._model_lower):
+                    elif "qwen" in self._model_lower or self._is_free_proxy:
                         await self.send_event({
                             "type": "input_image_buffer.append",
                             "image": snapshot_image_b64,
diff --git a/tests/unit/test_video_session.py b/tests/unit/test_video_session.py
@@ -136,14 +136,64 @@ async def test_non_native_vision_fallback():
     client = _make_client("step-realtime", supports_native_image=False)
     # Mark the image description as "analyzing" to trigger the vision model path
     client._image_description = "实时屏幕截图或相机画面正在分析中"
-    
+
     # Mock the _analyze_image_with_vision_model method
     client._analyze_image_with_vision_model = AsyncMock()
-    
+
     await client.stream_image(DUMMY_IMAGE_B64)
-    
+
     # Should have called vision model fallback
     assert client._analyze_image_with_vision_model.called
     assert client._analyze_image_with_vision_model.call_args[0][0] == DUMMY_IMAGE_B64
-    
+
+    await client.close()
+
+
+@pytest.mark.unit
+def test_livestream_free_supports_native_vision():
+    """A livestream streamer's self-hosted server_prefix upstream is also Gemini-family, so the free path must be judged as
+    native vision even when base_url contains neither lanlan.app nor lanlan.tech (it was already derived to the self-hosted address)."""
+    client = OmniRealtimeClient(
+        base_url="ws://streamer.example:8080/tok/core",
+        api_key="test-key",
+        model="free-model",
+        turn_detection_mode=TurnDetectionMode.SERVER_VAD,
+        api_type="free",
+        livestream_mode=True,
+    )
+    assert client._is_free_proxy is True
+    assert client._supports_native_image is True
+    # The self-hosted livestream upstream is Gemini-family; it must not be treated as a StepFun proxy with server VAD
+    assert client._has_server_vad is False
+
+
+@pytest.mark.unit
+async def test_livestream_free_image_streaming():
+    """Livestream free must send images via input_image_buffer.append (the Gemini proxy protocol)
+    and must not fall into the VISION_MODEL analysis channel."""
+    client = OmniRealtimeClient(
+        base_url="ws://streamer.example:8080/tok/core",
+        api_key="test-key",
+        model="free-model",
+        turn_detection_mode=TurnDetectionMode.SERVER_VAD,
+        api_type="free",
+        livestream_mode=True,
+    )
+    client.ws = AsyncMock()
+    client._audio_in_buffer = True
+    client._last_native_image_time = 0
+    client._analyze_image_with_vision_model = AsyncMock()
+
+    await client.stream_image(DUMMY_IMAGE_B64)
+
+    assert not client._analyze_image_with_vision_model.called, (
+        "Livestream free 不应走分析通道"
+    )
+    image_event_found = False
+    for call_args in client.ws.send.call_args_list:
+        msg = json.loads(call_args[0][0])
+        if msg.get("type") == "input_image_buffer.append":
+            image_event_found = True
+            assert msg["image"] == DUMMY_IMAGE_B64
+    assert image_event_found, "Expected input_image_buffer.append event for livestream free"
     await client.close()