Skip to content

Commit 99f930f

Browse files
wehos, Hongzhi Wen, claude
authored
fix(livestream): free 路 livestream 走原生视觉,不再绕分析通道 (#1508)
livestream 主播自建 server_prefix 的上游同为 Gemini 系,原生视觉与发图协议(input_image_buffer.append)与 lanlan.app+free 一致。原判定只认 base_url 含 lanlan.app,导致 livestream free(base_url 已派生为自建地址)被当成无原生视觉,每帧丢进 VISION_MODEL 分析通道转文字,多一次模型调用且画面延迟。

抽出共享谓词 _is_free_proxy(free + (lanlan.app 或 livestream)),统一三处原先散落的 lanlan.app+free 字面判断:_supports_native_image、stream_image 实时发图、prompt_ephemeral 主动注入截图。lanlan.tech free(StepFun 上游)行为不变。

Co-authored-by: Hongzhi Wen <cartabio.coder1@gmail.com>
Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 6b3700f commit 99f930f

2 files changed

Lines changed: 67 additions & 8 deletions

File tree

main_logic/omni_realtime_client.py

Lines changed: 13 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -416,12 +416,21 @@ def __init__(
416416
and not bool(livestream_mode)
417417
)
418418

419+
# free 经 Gemini 代理(OpenAI-realtime 协议,发图走 input_image_buffer.append、
420+
# 服务端 VAD 由代理吞掉):lanlan.app 海外节点,或 livestream 主播自建 server_prefix。
421+
# 二者上游同为 Gemini 系,原生视觉与发图协议一致;lanlan.tech free 上游是
422+
# StepFun(无原生视觉,走 VISION_MODEL 分析通道),不在此列。
423+
self._is_free_proxy = 'free' in self._model_lower and (
424+
'lanlan.app' in (base_url or '')
425+
or bool(livestream_mode)
426+
)
427+
419428
# Whether this client supports native image input
420-
# qwen/glm/gpt/gemini have native vision; lanlan.app replacement server (free, non-mainland) also does
429+
# qwen/glm/gpt/gemini have native vision; free Gemini-proxy (lanlan.app / livestream) also does
421430
self._supports_native_image = (
422431
any(m in self._model_lower for m in ['qwen', 'glm', 'gpt'])
423432
or self._is_gemini
424-
or ('lanlan.app' in (base_url or '') and 'free' in self._model_lower)
433+
or self._is_free_proxy
425434
)
426435
self._gemini_client = None # genai.Client instance
427436
self._gemini_session = None # Live session from SDK
@@ -1372,7 +1381,7 @@ async def stream_image(self, image_b64: str) -> None:
13721381
self._fatal_error_occurred = True
13731382
return
13741383

1375-
if ('lanlan.app' in self.base_url and 'free' in self._model_lower):
1384+
if self._is_free_proxy:
13761385
append_event = {
13771386
"type": "input_image_buffer.append" ,
13781387
"image": image_b64
@@ -1787,7 +1796,7 @@ async def prompt_ephemeral(
17871796
}],
17881797
},
17891798
})
1790-
elif "qwen" in self._model_lower or ("lanlan.app" in self.base_url and "free" in self._model_lower):
1799+
elif "qwen" in self._model_lower or self._is_free_proxy:
17911800
await self.send_event({
17921801
"type": "input_image_buffer.append",
17931802
"image": snapshot_image_b64,

tests/unit/test_video_session.py

Lines changed: 54 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -136,14 +136,64 @@ async def test_non_native_vision_fallback():
136136
client = _make_client("step-realtime", supports_native_image=False)
137137
# Mark the image description as "analyzing" to trigger the vision model path
138138
client._image_description = "实时屏幕截图或相机画面正在分析中"
139-
139+
140140
# Mock the _analyze_image_with_vision_model method
141141
client._analyze_image_with_vision_model = AsyncMock()
142-
142+
143143
await client.stream_image(DUMMY_IMAGE_B64)
144-
144+
145145
# Should have called vision model fallback
146146
assert client._analyze_image_with_vision_model.called
147147
assert client._analyze_image_with_vision_model.call_args[0][0] == DUMMY_IMAGE_B64
148-
148+
149+
await client.close()
150+
151+
152+
@pytest.mark.unit
153+
def test_livestream_free_supports_native_vision():
154+
"""Livestream 主播自建 server_prefix 上游同为 Gemini 系,free 路应被判定为原生视觉,
155+
哪怕 base_url 既不含 lanlan.app 也不含 lanlan.tech(已被派生为自建地址)。"""
156+
client = OmniRealtimeClient(
157+
base_url="ws://streamer.example:8080/tok/core",
158+
api_key="test-key",
159+
model="free-model",
160+
turn_detection_mode=TurnDetectionMode.SERVER_VAD,
161+
api_type="free",
162+
livestream_mode=True,
163+
)
164+
assert client._is_free_proxy is True
165+
assert client._supports_native_image is True
166+
# livestream 自建上游是 Gemini 系,不应被当成有 server VAD 的 StepFun proxy
167+
assert client._has_server_vad is False
168+
169+
170+
@pytest.mark.unit
171+
async def test_livestream_free_image_streaming():
172+
"""Livestream free 发图应走 input_image_buffer.append(Gemini 代理协议),
173+
不落入 VISION_MODEL 分析通道。"""
174+
client = OmniRealtimeClient(
175+
base_url="ws://streamer.example:8080/tok/core",
176+
api_key="test-key",
177+
model="free-model",
178+
turn_detection_mode=TurnDetectionMode.SERVER_VAD,
179+
api_type="free",
180+
livestream_mode=True,
181+
)
182+
client.ws = AsyncMock()
183+
client._audio_in_buffer = True
184+
client._last_native_image_time = 0
185+
client._analyze_image_with_vision_model = AsyncMock()
186+
187+
await client.stream_image(DUMMY_IMAGE_B64)
188+
189+
assert not client._analyze_image_with_vision_model.called, (
190+
"Livestream free 不应走分析通道"
191+
)
192+
image_event_found = False
193+
for call_args in client.ws.send.call_args_list:
194+
msg = json.loads(call_args[0][0])
195+
if msg.get("type") == "input_image_buffer.append":
196+
image_event_found = True
197+
assert msg["image"] == DUMMY_IMAGE_B64
198+
assert image_event_found, "Expected input_image_buffer.append event for livestream free"
149199
await client.close()

0 commit comments

Comments
 (0)