Merge pull request #33 from NVIDIA-AI-IOT/feat/demo

tokk-nv · web-flow · commit aa55415cd90a · 2026-03-14T12:46:01.000-07:00
Update installation instructions and refine system prompts across presets
diff --git a/INSTALL.md b/INSTALL.md
@@ -65,6 +65,11 @@ sudo apt-get install -y portaudio19-dev
 
 ### 2. Virtual Environment
 
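+> **Note:** If `python3 -m venv` fails with "No module named venv", install the venv package for your Python version first (the example below is for Python 3.12; on Debian/Ubuntu, `python3-venv` covers the distribution's default Python):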
+> ```bash
+> sudo apt install python3.12-venv
+> ```
+
 ```bash
 # Create venv
 python3 -m venv .venv
@@ -392,6 +397,11 @@ The second volume `-v ${HOME}/.cache/vllm:/root/.cache/vllm` persists vLLM’s *
 >
 > **Memory tuning**: On shared-memory systems (Jetson), lower `--gpu-memory-utilization` to leave room for the OS, Riva, and the application. On discrete GPUs with dedicated VRAM, `0.8` is safe.
 >
+> **GPU memory cleanup**: If vLLM fails to start with an out-of-memory (OOM) error right after you stop another GPU container, drop the kernel caches (page cache, dentries, and inodes) first so that reclaimable memory is freed:
+> ```bash
+> sudo sysctl -w vm.drop_caches=3
+> ```
+>
 > **Desktop GPU / x86_64**: Use `vllm/vllm-openai:latest` or `nvcr.io/nvidia/vllm:latest` instead of the Jetson image.
 
 ### vLLM troubleshooting
diff --git a/presets/cosmos-reason.yaml b/presets/cosmos-reason.yaml
@@ -20,10 +20,10 @@ llm:
   temperature: 0.3          # Low temp — critical for precise, consistent vision responses
   max_tokens: 512           # Hard cap on reasoning+answer combined; model uses ~150-275 total
   history_turns: 0           # Disabled — text-only history anchors VLM to prior answers
-  system_prompt: "You analyze live video from the user's camera. Answer based on what you see. Be precise and concise — 1-2 short sentences only."
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
   enable_vision: true
-  vision_system_prompt: "You analyze live video from the user's camera. Answer based on what you see. Be precise and concise — 1-2 short sentences only."
-  vision_frames: 100         # Video mode: request many frames for temporal video encoding
+  vision_system_prompt: "You are a helpful voice and vision assistant. Give ONE short sentence answers only. Be direct. Plain text only, no markdown, no bullet points, no emojis."
+  vision_frames: 30         # Video mode: request many frames for temporal video encoding
   vision_detail: auto
   vision_quality: 0.8
   vision_max_width: 768
diff --git a/presets/default.yaml b/presets/default.yaml
@@ -27,12 +27,11 @@ llm:
   max_tokens: 512
   minimal_output: false
   stream: true
-  system_prompt: You are a helpful voice assistant.
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
   extra_request_body: ''
   cheap_model: nvidia/cosmos-reason2-8b-fp8
   enable_vision: true
-  vision_system_prompt: You are a vision assistant. Give ONE short sentence answers
-    only. Be direct. No explanations. Use plain text only — no markdown or formatting.
+  vision_system_prompt: "You are a helpful voice and vision assistant. Give ONE short sentence answers only. Be direct. Plain text only, no markdown, no bullet points, no emojis."
   vision_detail: auto
   vision_frames: 10
   vision_quality: 0.8
diff --git a/presets/high-accuracy.yaml b/presets/high-accuracy.yaml
@@ -18,7 +18,7 @@ llm:
   model: llama3.1:8b  # Larger, more capable model
   temperature: 0.5    # Lower temperature for more consistent responses
   max_tokens: 1024    # Allow longer, detailed responses
-  system_prompt: "You are a knowledgeable voice assistant. Provide thorough, accurate responses."
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
 
 tts:
   scheme: riva
diff --git a/presets/llm-router.yaml b/presets/llm-router.yaml
@@ -0,0 +1,50 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+name: "LLM Router (MoM + Local VLM)"
+description: "Remote MoM model via LLM router with local Edge LLM as utility model for titles and vision"
+
+asr:
+  scheme: riva
+  server: localhost:50051
+  model: parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer
+  language: en-US
+  vad_start_threshold: 0.5
+  vad_stop_threshold: 0.3
+  speech_pad_ms: 600
+  speech_timeout_ms: 1200
+
+llm:
+  scheme: openai
+  api_base: http://10.110.51.30:8801/v1
+  model: MoM
+  temperature: 0.3
+  max_tokens: 512
+  history_turns: 0
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
+  enable_vision: true
+  vision_system_prompt: "You are a helpful voice and vision assistant. Give ONE short sentence answers only. Be direct. Plain text only, no markdown, no bullet points, no emojis."
+  vision_frames: 30
+  vision_detail: auto
+  vision_quality: 0.8
+  vision_max_width: 768
+  vision_buffer_fps: 5.0
+  vision_video_encode: true
+  enable_reasoning: true
+
+tts:
+  scheme: riva
+  server: localhost:50051
+  voice: ""
+  sample_rate: 22050
+  stream_tts: true
+
+devices:
+  audio_input_source: browser
+  audio_output_source: browser
+
+app:
+  barge_in_enabled: true
+  timeline_position: right
+  session_auto_save: true
+  session_output_dir: ./sessions
+  theme: dark
diff --git a/presets/low-latency.yaml b/presets/low-latency.yaml
@@ -18,7 +18,7 @@ llm:
   model: llama3.2:3b  # Fast small model
   temperature: 0.7
   max_tokens: 256     # Shorter responses
-  system_prompt: "You are a helpful voice assistant. Keep responses concise."
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
 
 tts:
   scheme: riva
diff --git a/presets/openai-realtime.yaml b/presets/openai-realtime.yaml
@@ -19,7 +19,7 @@ llm:
   model: gpt-4o-realtime-preview
   temperature: 0.7
   max_tokens: 512
-  system_prompt: "You are a helpful voice assistant."
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
 
 tts:
   scheme: openai-realtime
diff --git a/presets/tensorrt-edge-cosmos.yaml b/presets/tensorrt-edge-cosmos.yaml
@@ -1,33 +1,36 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 name: "Cosmos-Reason2 (TensorRT Edge LLM)"
-description: "Cosmos-Reason2 on TensorRT Edge LLM backend — optimized edge inference with image input"
+description: "Cosmos-Reason2 on TensorRT Edge LLM backend — optimized edge inference with video input"
 
 asr:
   scheme: riva
   server: localhost:50051
-  model: conformer
+  model: parakeet-1.1b-en-US-asr-streaming-silero-vad-sortformer
   language: en-US
   vad_start_threshold: 0.5
   vad_stop_threshold: 0.3
-  speech_timeout_ms: 500
+  speech_pad_ms: 600
+  speech_timeout_ms: 1200
 
 llm:
   scheme: openai
   api_base: http://localhost:58010/v1
-  model: qwen3-vl
+  model: /workspace/cosmos_onnx/visual-fp16
   temperature: 0.3
   max_tokens: 512
+  history_turns: 0
   enable_reasoning: false
-  system_prompt: "You are a vision assistant observing the user through a live camera. Answer directly in one short sentence. Do not think step-by-step or explain your reasoning."
+  extra_request_body: '{"chat_template_kwargs": {"enable_thinking": false}}'
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
   enable_vision: true
-  vision_system_prompt: "You are a vision assistant observing the user through a live camera. Answer directly in one short sentence. Do not think step-by-step or explain your reasoning."
-  vision_frames: 3
+  vision_system_prompt: "You are a helpful voice and vision assistant. Give ONE short sentence answers only. Be direct. Plain text only, no markdown, no bullet points, no emojis."
+  vision_frames: 30
   vision_detail: auto
-  vision_quality: 0.7
-  vision_max_width: 640
-  vision_buffer_fps: 3.0
-  vision_video_encode: false
+  vision_quality: 0.8
+  vision_max_width: 768
+  vision_buffer_fps: 5.0
+  vision_video_encode: true
 
 tts:
   scheme: riva
@@ -37,7 +40,6 @@ tts:
   stream_tts: true
 
 devices:
-  video_source: browser
   audio_input_source: browser
   audio_output_source: browser
 
diff --git a/presets/text-only.yaml b/presets/text-only.yaml
@@ -12,7 +12,7 @@ llm:
   model: llama3.2:3b
   temperature: 0.7
   max_tokens: 512
-  system_prompt: "You are a helpful AI assistant."
+  system_prompt: "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
 
 tts:
   scheme: none
diff --git a/src/multi_modal_ai_studio/config/schema.py b/src/multi_modal_ai_studio/config/schema.py
@@ -139,7 +139,7 @@ class LLMConfig:
     temperature: float = 0.7
     max_tokens: int = 512
     minimal_output: bool = False
-    system_prompt: str = "You are a helpful voice assistant."
+    system_prompt: str = "You are a helpful voice AI assistant. Plain text only, no markdown, no bullet points, no emojis."
     extra_request_body: Optional[str] = None
     top_p: float = 1.0
     frequency_penalty: float = 0
@@ -153,7 +153,7 @@ class LLMConfig:
     # When enable_vision=True, camera frames are captured and sent with each prompt
     # -------------------------------------------------------------------------
     enable_vision: bool = False  # Set True for VLM models (Cosmos-Reason, LLaVA, GPT-4V, etc.)
-    vision_system_prompt: str = "You are a vision assistant. Give ONE short sentence answers only. Be direct. No explanations."
+    vision_system_prompt: str = "You are a helpful voice and vision assistant. Give ONE short sentence answers only. Be direct. Plain text only, no markdown, no bullet points, no emojis."
     vision_detail: Literal["low", "high", "auto"] = "auto"  # OpenAI vision detail level
     vision_frames: int = 4       # Frames per turn (1=single at speech end, 2-10=during speech)
     vision_quality: float = 0.7  # JPEG quality (0.3=fast/small, 1.0=high quality)
diff --git a/src/multi_modal_ai_studio/devices/local.py b/src/multi_modal_ai_studio/devices/local.py
@@ -23,18 +23,41 @@ def _can_capture_video(device_path: str) -> bool:
     
     Some cameras create multiple /dev/video* nodes where only one is the actual
     capture device (others are metadata/control devices).
+    
+    Uses V4L2 ioctl to check capabilities without opening the device exclusively,
+    so it doesn't conflict with active camera streams.
     """
+    import fcntl
+    import struct
+
+    VIDIOC_QUERYCAP = 0x80685600
+    V4L2_CAP_VIDEO_CAPTURE = 0x00000001
+
     try:
-        import cv2
-        cap = cv2.VideoCapture(device_path)
-        if not cap.isOpened():
-            return False
-        ret, _ = cap.read()
-        cap.release()
-        return ret
+        fd = open(device_path, "rb")
+        try:
+            buf = bytearray(104)
+            fcntl.ioctl(fd, VIDIOC_QUERYCAP, buf)
+            capabilities = struct.unpack_from("<I", buf, 84)[0]
+            device_caps_field = struct.unpack_from("<I", buf, 88)[0]
+            caps = device_caps_field if device_caps_field else capabilities
+            return bool(caps & V4L2_CAP_VIDEO_CAPTURE)
+        finally:
+            fd.close()
+    except (OSError, IOError) as e:
+        logger.debug("V4L2 capability check for %s: %s", device_path, e)
+        # If ioctl fails (e.g. device busy), fall back to checking sysfs
+        try:
+            dev_name = Path(device_path).name
+            index_path = Path(f"/sys/class/video4linux/{dev_name}/index")
+            if index_path.exists():
+                return index_path.read_text().strip() == "0"
+        except Exception:
+            pass
+        return True
     except Exception as e:
         logger.debug("Check capture capability for %s: %s", device_path, e)
-        return False
+        return True
 
 
 def list_local_cameras() -> List[Dict[str, str]]:
diff --git a/src/multi_modal_ai_studio/webui/static/app.js b/src/multi_modal_ai_studio/webui/static/app.js
@@ -5074,15 +5074,27 @@ function startPreviewStream(options) {
         var streamUrl = getApiBase() + '/api/camera/stream?device=' + deviceParam;
         var wsUrl = (getApiBase().replace(/^https/, 'wss').replace(/^http/, 'ws') || ('wss://' + window.location.host)) + '/ws/camera-webrtc?device=' + deviceParam;
         function fallbackToMjpeg() {
-            if (mjpegFeed) {
-                mjpegFeed.src = streamUrl;
-                mjpegFeed.style.display = 'block';
+            // Close WebRTC so the server releases the camera device before MJPEG opens it
+            if (state.cameraWebrtcWs) {
+                try { state.cameraWebrtcWs.close(); } catch (e) {}
+                state.cameraWebrtcWs = null;
+            }
+            if (state.cameraWebrtcPc) {
+                try { state.cameraWebrtcPc.close(); } catch (e) {}
+                state.cameraWebrtcPc = null;
             }
             if (videoFeed) {
                 videoFeed.src = '';
                 videoFeed.srcObject = null;
                 videoFeed.style.display = 'none';
             }
+            // Small delay to let the server release the camera device
+            setTimeout(function () {
+                if (mjpegFeed) {
+                    mjpegFeed.src = streamUrl;
+                    mjpegFeed.style.display = 'block';
+                }
+            }, 500);
             if (imagePlaceholder) imagePlaceholder.style.display = 'none';
         }
         var pc = new RTCPeerConnection();