sgl-project · CTKnight · Apr 12, 2026 · Apr 12, 2026 · Apr 13, 2026 · Apr 13, 2026
@@ -60,6 +60,12 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--relay-backend", type=str, default="shm", choices=["nixl", "shm"]
     )
+    parser.add_argument(
+        "--mem-fraction-static",
+        type=float,
+        default=0.7,
+        help="Static memory fraction for SGLang-backed AR stages.",
+    )
 
     # Server
     parser.add_argument("--host", type=str, default="0.0.0.0")
@@ -91,6 +97,13 @@ async def main_async(args: argparse.Namespace) -> None:
         gpu_placement=gpu_placement,
     )
 
+    server_args_overrides = {"mem_fraction_static": args.mem_fraction_static}
+    for stage in config.stages:
+        if stage.name in {"thinker", "talker_ar"}:
+            stage.executor.args.setdefault("server_args_overrides", {}).update(
+                server_args_overrides
+            )
+
     runner = MultiProcessPipelineRunner(config)
     logger.info("Starting 9-stage speech pipeline (multiprocess)...")
     await runner.start(timeout=600)

diff --git a/playground/README.md b/playground/README.md
@@ -6,6 +6,7 @@ This directory contains multiple playground interfaces for SGLang-Omni.
 |---|---|
 | `web/` | Full-featured HTML/CSS/JS UI served directly by the sglang-omni server. Supports text, audio, image, video inputs and a built-in file browser. |
 | `gradio/` | Lightweight Gradio app that connects to a running server via HTTP. Text chat with streaming, model selector, and generation parameter controls. |
+| `realtime-ws/` | Standalone websocket realtime app with server-side VAD, text input, microphone streaming, and streamed assistant audio playback. |
 | `tts/` | S2 Pro TTS Gradio app with shared controls for voice cloning plus separate streaming and non-streaming playback modes. |
 
 ## Web Playground
@@ -20,6 +21,88 @@ uv pip install -v -e .
 
 Then open `http://localhost:8000` in your browser.
 
+## Realtime WebSocket Playground
+
+Install the project before launching:
+
+```bash
+uv pip install -v -e .
+```
+
+Launch the backend plus standalone frontend app with one command:
+
+```bash
+./playground/realtime-ws/start.sh [--mock] [realtime-options] [backend-options...]
+```
+
+Minimal usable commands:
+
+```bash
+# local smoke test
+./playground/realtime-ws/start.sh --mock
+
+# real model
+./playground/realtime-ws/start.sh --model-path Qwen/Qwen3-Omni-30B-A3B-Instruct
+```
+
+In normal (non-mock) backend mode, pass the usual speech server flags such as `--model-path`:
+
+```bash
+./playground/realtime-ws/start.sh \
+  --model-path Qwen/Qwen3-Omni-30B-A3B-Instruct
+```
+
+Then open `http://localhost:7862`.
+
+For a browser smoke test without loading any model, launch the mock realtime API:
+
+```bash
+./playground/realtime-ws/start.sh --mock
+```
+
+That path exercises:
+
+- browser microphone capture over websocket PCM streaming
+- server-side VAD turn detection
+- automatic response start after speech stop
+- streamed assistant audio playback in the browser
+- text prompts over the same websocket session
+
+Instead of calling the inference pipeline, the mock backend returns canned text
+and plays back the captured client audio (falling back to a synthetic tone when
+there is no input audio).
+
+### Remote browser over SSH port forwarding
+
+Because the transport is plain HTTP + WebSocket, standard SSH forwarding is
+enough for remote browser testing.
+
+Example — on the remote machine, start the playground:
+
+```bash
+./playground/realtime-ws/start.sh --mock
+```
+
+Then, on your local machine, forward the backend port (8000) and the frontend port (7862) to the remote host:
+
+```bash
+ssh -L 8000:localhost:8000 -L 7862:localhost:7862 user@host
+```
+
+For the full launcher help, run:
+
+```bash
+./playground/realtime-ws/start.sh --help
+```
+
+The websocket playground:
+
+- streams microphone PCM to the backend over `/v1/realtime/ws`
+- runs server-side VAD to auto-trigger one inference turn per utterance
+- supports manual push-to-talk and text prompts in the same session
+- streams assistant audio back over the websocket and auto-plays it in the browser
+- keeps the frontend separate from the inference API server
+
 ### Custom port
 
 ```bash
@@ -95,6 +178,7 @@ ssh -L 8000:localhost:8000 -L 7860:localhost:7860 user@host
 | `/` | Web playground UI (index.html, app.js, styles.css) |
 | `/v1/chat/completions` | Chat completions (text + audio, streaming) |
 | `/v1/audio/speech` | Text-to-speech |
+| `/v1/realtime/ws` | Realtime websocket session transport |
 | `/v1/models` | List available models |
 | `/v1/fs/list` | Browse server filesystem |
 | `/v1/fs/file` | Download a server file |