|
1 | 1 | /** |
2 | | - * Vision Tab — placeholder. |
| 2 | + * Vision Tab — V2 canonical VLM camera description. |
3 | 3 | * |
4 | | - * VLM worker bridge is at `@runanywhere/web-llamacpp/vlm-worker`. This |
5 | | - * view is awaiting WEB-08 re-land — see gaps/gaps/inconsistencies/web.md. |
| 4 | + * Re-landed against the existing `VLMWorkerBridge` (off-main-thread WASM |
| 5 | + * runtime) and the core `VideoCapture` helper. Flow is: |
| 6 | + * |
| 7 | + * 1. User downloads + loads a VLM (e.g. SmolVLM 500M) via the shared |
| 8 | + * model selection sheet (download + `modelLifecycle.load`). |
| 9 | + * 2. User starts the camera — `VideoCapture` attaches its `<video>` to |
| 10 | + * the preview container. |
| 11 | + * 3. User clicks "Capture & analyze" — the latest frame is extracted as |
| 12 | + * RGB pixels, wrapped in a `VLMImage` proto message, and dispatched |
| 13 | + * through `VLMWorkerBridge.shared.process(image, options)`. The |
| 14 | + * worker decodes on its side, calls `_rac_vlm_process_proto`, and |
| 15 | + * returns the encoded `VLMResult`. |
| 16 | + * |
| 17 | + * The worker-side `loadModel` wiring (raw GGUF + mmproj bytes transferred |
| 18 | + * zero-copy into the worker's MEMFS) is still TBD — until the backend |
| 19 | + * package installs it, `VLMWorkerBridge.shared.isModelLoaded` stays false |
| 20 | + * and the view surfaces the situation inline rather than rendering a blank |
| 21 | + * placeholder. |
6 | 22 | */ |
7 | 23 |
|
8 | 24 | import type { TabLifecycle } from '../app'; |
9 | | -import { renderFeatureUnavailable } from '../components/feature-unavailable'; |
| 25 | +import { |
| 26 | + RunAnywhere, |
| 27 | + VideoCapture, |
| 28 | + VLMImageFormat, |
| 29 | + isSDKException, |
| 30 | + type VLMGenerationOptions, |
| 31 | + type VLMImage, |
| 32 | + type VLMResult, |
| 33 | +} from '@runanywhere/web'; |
| 34 | +import { VLMWorkerBridge } from '@runanywhere/web-llamacpp'; |
| 35 | +import { |
| 36 | + ensureCatalogRegistered, |
| 37 | + onModelStateChange, |
| 38 | + openSheet, |
| 39 | +} from '../components/model-selection'; |
| 40 | + |
/** Model id that `isVLMModelLoaded` compares against the lifecycle's current model. */
const VLM_MODEL_ID = 'smolvlm-500m-instruct-q8_0';

/** Initial prompt text, also used as the fallback when the prompt box is empty. */
const DEFAULT_PROMPT = 'Describe what you see in this image.';

/** Value passed to `VideoCapture.captureFrame` and sent as `maxImageSize`. */
const CAPTURE_DIMENSION = 384;

// Module-level view state. The tab is rendered from these values on every
// `renderView()` call.

/** Root element handed to `initVisionTab`. */
let container: HTMLElement;

/** Camera helper; `null` whenever the camera is stopped or failed to start. */
let camera: VideoCapture | null = null;

/** Most recently captured frame, cleared when the camera stops. */
let latestFrame: {
  rgbPixels: Uint8Array;
  width: number;
  height: number;
} | null = null;

/** Text of the last VLM response, or `null` before any response exists. */
let lastResult: string | null = null;

/** Current status line shown under the Analyze controls. */
let status = '';

/** True while a camera start or an inference call is in flight. */
let isBusy = false;

/** Disposer returned by `onModelStateChange`, or `null` when not subscribed. */
let unsubscribeState: (() => void) | null = null;
10 | 52 |
|
/**
 * Mounts the Vision tab into `el` and returns its lifecycle hooks.
 *
 * Registers the model catalog, renders the view once, and subscribes to
 * shared model-state changes so the view re-renders when a model is
 * loaded or swapped.
 *
 * @param el Host element the tab renders into.
 * @returns Hooks for the tab host: `onActivate` re-registers the catalog
 *   and re-renders; `onDeactivate` stops the camera.
 */
export function initVisionTab(el: HTMLElement): TabLifecycle {
  container = el;

  ensureCatalogRegistered();
  renderView();

  // Release any subscription left over from a previous init so repeated
  // mounts do not stack model-state listeners.
  unsubscribeState?.();

  // Re-render when the shared model state changes so the "Load model"
  // button reflects real state without manual refresh.
  unsubscribeState = onModelStateChange(() => renderView());

  return {
    onActivate: () => {
      ensureCatalogRegistered();
      renderView();
    },
    onDeactivate: () => {
      stopCamera();
    },
  };
}
| 73 | + |
| 74 | +// --------------------------------------------------------------------------- |
| 75 | +// Rendering |
| 76 | +// --------------------------------------------------------------------------- |
| 77 | + |
/**
 * Rebuilds the whole Vision panel from module state and rewires its buttons.
 *
 * The panel is replaced wholesale via `innerHTML`, so anything that lives
 * only in the DOM has to be carried across explicitly: the prompt text is
 * read back from the previous textarea before the rebuild, and the camera's
 * `<video>` element is re-attached afterwards.
 */
function renderView(): void {
  const bridge = VLMWorkerBridge.shared;
  const modelLoaded = isVLMModelLoaded();
  const workerLoaded = bridge.isModelLoaded;
  const captureReady = camera?.isCapturing ?? false;
  const canAnalyze = workerLoaded && captureReady && !isBusy;

  // Keep whatever the user typed; without this every re-render (capture,
  // model-state event, analysis finishing) reset the box to the default.
  // On the very first render there is no textarea yet, hence the fallback.
  const promptText =
    container.querySelector<HTMLTextAreaElement>('#vision-prompt')?.value ?? DEFAULT_PROMPT;

  container.innerHTML = `
    <div class="toolbar">
      <div class="toolbar-title">Vision</div>
      <div class="toolbar-actions">
        <button class="btn btn-secondary" id="vision-model-btn">
          ${modelLoaded ? 'Change Model' : 'Load SmolVLM'}
        </button>
      </div>
    </div>
    <div class="scroll-area">
      <div class="docs-section">
        <h3>Backend status</h3>
        <ul class="feature-unavailable__list">
          <li><code>VLM model loaded</code>: <strong>${modelLoaded ? 'yes' : 'no'}</strong></li>
          <li><code>VLMWorkerBridge.isInitialized</code>: <strong>${bridge.isInitialized ? 'yes' : 'no'}</strong></li>
          <li><code>VLMWorkerBridge.isModelLoaded</code>: <strong>${workerLoaded ? 'yes' : 'no'}</strong></li>
          <li><code>camera.isCapturing</code>: <strong>${captureReady ? 'yes' : 'no'}</strong></li>
        </ul>
      </div>

      <div class="docs-section">
        <h3>Camera</h3>
        <p class="text-secondary">Attach your webcam and capture frames as RGB pixels for VLM inference.</p>
        <div class="toolbar-actions">
          <button class="btn btn-primary" id="vision-camera-btn" ${isBusy ? 'disabled' : ''}>
            ${captureReady ? 'Stop camera' : 'Start camera'}
          </button>
          <button class="btn btn-secondary" id="vision-capture-btn" ${captureReady && !isBusy ? '' : 'disabled'}>
            Capture frame
          </button>
        </div>
        <div id="vision-preview" class="vision-preview"></div>
        <div id="vision-frame-meta" class="docs-status">${frameMetaLabel()}</div>
      </div>

      <div class="docs-section">
        <h3>Analyze</h3>
        <p class="text-secondary">
          Runs <code>VLMWorkerBridge.shared.process(image, options)</code> on the last
          captured frame. The worker decodes the proto message and calls
          <code>_rac_vlm_process_proto</code> off-thread.
        </p>
        <label class="form-label" for="vision-prompt">Prompt</label>
        <textarea id="vision-prompt" class="chat-input" rows="2"
          ${isBusy ? 'disabled' : ''}
          placeholder="What's in this image?">${escape(promptText)}</textarea>
        <div class="toolbar-actions">
          <button class="btn btn-primary" id="vision-analyze-btn" ${canAnalyze ? '' : 'disabled'}>
            ${isBusy ? 'Analyzing…' : 'Capture & analyze'}
          </button>
        </div>
        <div id="vision-status" class="docs-status">${escape(status)}</div>
        <pre id="vision-output" class="docs-pre">${escape(lastResult ?? '(no response yet)')}</pre>
      </div>
    </div>
  `;

  reattachCameraPreview();

  // The buttons were created by the template just above, so the non-null
  // assertions cannot fail here.
  container
    .querySelector('#vision-model-btn')!
    .addEventListener('click', () => openSheet());
  container
    .querySelector('#vision-camera-btn')!
    .addEventListener('click', () => void toggleCamera());
  container
    .querySelector('#vision-capture-btn')!
    .addEventListener('click', () => captureFrame());
  container
    .querySelector('#vision-analyze-btn')!
    .addEventListener('click', () => void onAnalyze());
}
| 157 | + |
/**
 * Moves the camera's `<video>` element into the freshly rendered preview
 * slot. Does nothing when there is no preview slot or no camera instance.
 */
function reattachCameraPreview(): void {
  const previewHost = container.querySelector<HTMLElement>('#vision-preview');
  const activeCamera = camera;
  if (previewHost === null || activeCamera === null) {
    return;
  }

  // Empty the slot first so it only ever holds the single video element.
  previewHost.innerHTML = '';
  previewHost.appendChild(activeCamera.videoElement);
}
| 164 | + |
/**
 * Builds the caption shown under the preview: the dimensions and byte size
 * of the last captured frame, or a placeholder when none has been captured.
 */
function frameMetaLabel(): string {
  const frame = latestFrame;
  if (frame === null) {
    return 'No frame captured yet.';
  }

  const { width, height, rgbPixels } = frame;
  const byteCount = rgbPixels.byteLength.toLocaleString();
  return `Last frame: ${width}×${height} RGB (${byteCount} bytes)`;
}
| 169 | + |
| 170 | +// --------------------------------------------------------------------------- |
| 171 | +// Camera |
| 172 | +// --------------------------------------------------------------------------- |
| 173 | + |
/**
 * Click handler for the camera button: stops the camera and re-renders when
 * it is currently capturing, otherwise starts it.
 */
async function toggleCamera(): Promise<void> {
  const running = camera?.isCapturing;
  if (!running) {
    await startCamera();
    return;
  }

  stopCamera();
  renderView();
}
| 182 | + |
/**
 * Starts the webcam, creating the `VideoCapture` helper on first use, and
 * reflects progress through the status line and `isBusy`.
 *
 * `stopCamera()` can run while the start is still pending (for example the
 * tab is deactivated during the permission prompt). In that case the module
 * no longer owns this instance, so it is stopped again once `start()`
 * settles instead of being left running with nothing referencing it.
 */
async function startCamera(): Promise<void> {
  const session =
    camera ??
    new VideoCapture({
      facingMode: 'environment',
      idealWidth: 640,
      idealHeight: 480,
    });
  camera = session;
  isBusy = true;
  setStatus('Requesting camera access…');
  renderView();
  try {
    await session.start();
    if (camera === session) {
      setStatus('Camera ready.');
    } else {
      // The camera was stopped or replaced while we were waiting; release
      // the capture that just came up so the device is not held open.
      // NOTE(review): assumes `stop()` is safe to call a second time.
      session.stop();
      setStatus('Camera stopped.');
    }
  } catch (err) {
    setStatus(`Camera error: ${formatErr(err)}`);
    // Only forget the instance we tried to start, never a newer one.
    if (camera === session) {
      camera = null;
    }
  } finally {
    isBusy = false;
    renderView();
  }
}
23 | 203 |
|
24 | | - return {}; |
/**
 * Stops the camera if one exists, then drops both the camera instance and
 * the last captured frame. Does not re-render; callers do that themselves.
 */
function stopCamera(): void {
  if (camera !== null) {
    camera.stop();
  }
  camera = null;
  latestFrame = null;
}
| 209 | + |
/**
 * Click handler for "Capture frame": grabs a frame from the running camera,
 * stores it as the latest frame, and re-renders with a status message.
 * Silently ignored when the camera is not capturing.
 */
function captureFrame(): void {
  const source = camera;
  if (!source?.isCapturing) {
    return;
  }

  const grabbed = source.captureFrame(CAPTURE_DIMENSION);
  if (grabbed) {
    latestFrame = grabbed;
    setStatus(`Captured ${grabbed.width}×${grabbed.height} frame.`);
  } else {
    setStatus('Failed to capture frame.');
  }
  renderView();
}
| 222 | + |
| 223 | +// --------------------------------------------------------------------------- |
| 224 | +// Analyze |
| 225 | +// --------------------------------------------------------------------------- |
| 226 | + |
/**
 * Click handler for the analyze button.
 *
 * Checks that the camera is capturing and the worker bridge reports a loaded
 * model, takes the stored frame (or captures one if none is stored), wraps
 * it in a `VLMImage`, and runs it through `VLMWorkerBridge.shared.process`.
 * The response text and timing summary, or the failure message, end up in
 * module state and the view is re-rendered.
 */
async function onAnalyze(): Promise<void> {
  // Shared exit path for the precondition failures below.
  const bail = (message: string): void => {
    setStatus(message);
    renderView();
  };

  const activeCamera = camera;
  if (!activeCamera?.isCapturing) {
    bail('Start the camera first.');
    return;
  }

  const bridge = VLMWorkerBridge.shared;
  if (!bridge.isModelLoaded) {
    bail(
      'The VLM Worker has no model loaded. Load SmolVLM, then re-run Analyze — ' +
        'worker-side model plumbing lands once the backend registers a VLM loader.',
    );
    return;
  }

  // Prefer the frame already captured; only grab a new one when none exists.
  const snapshot = latestFrame ?? activeCamera.captureFrame(CAPTURE_DIMENSION);
  if (!snapshot) {
    bail('Failed to capture a frame for analysis.');
    return;
  }
  latestFrame = snapshot;

  // An empty or whitespace-only prompt falls back to the default.
  const promptBox = container.querySelector<HTMLTextAreaElement>('#vision-prompt');
  const typedPrompt = (promptBox?.value ?? DEFAULT_PROMPT).trim();
  const prompt = typedPrompt || DEFAULT_PROMPT;

  const options: VLMGenerationOptions = {
    prompt,
    maxTokens: 128,
    temperature: 0.7,
    topP: 0.9,
    topK: 40,
    stopSequences: [],
    streamingEnabled: false,
    systemPrompt: undefined,
    maxImageSize: CAPTURE_DIMENSION,
    nThreads: 0,
    useGpu: false,
    modelFamily: 0,
    customChatTemplate: undefined,
    imageMarkerOverride: undefined,
    seed: 0,
    repetitionPenalty: 1.1,
    minP: 0.05,
    emitImageEmbeddings: false,
  };

  const { rgbPixels, width, height } = snapshot;
  const image: VLMImage = {
    filePath: undefined,
    encoded: undefined,
    rawRgb: rgbPixels,
    base64: undefined,
    width,
    height,
    format: VLMImageFormat.VLM_IMAGE_FORMAT_RAW_RGB,
    mediaType: 'image/rgb',
    name: 'camera-frame',
    sizeBytes: rgbPixels.byteLength,
    metadata: {},
  };

  isBusy = true;
  setStatus('Running VLM inference off-thread…');
  lastResult = null;
  renderView();

  try {
    const outcome: VLMResult = await bridge.process(image, options);
    lastResult = outcome.text || '(empty response)';

    // Timing details are appended only when a throughput figure was reported.
    let throughput = '';
    if (outcome.tokensPerSecond > 0) {
      throughput = ` — ${outcome.completionTokens} tokens in ${Math.round(outcome.processingTimeMs)}ms (${outcome.tokensPerSecond.toFixed(1)} tok/s)`;
    }
    setStatus(`Done${throughput}.`);
  } catch (err) {
    setStatus(`VLM inference failed: ${formatErr(err)}`);
  } finally {
    isBusy = false;
    renderView();
  }
}
| 310 | + |
| 311 | +// --------------------------------------------------------------------------- |
| 312 | +// Helpers |
| 313 | +// --------------------------------------------------------------------------- |
| 314 | + |
/**
 * Reports whether the model lifecycle's current model is the VLM this tab
 * expects (`VLM_MODEL_ID`). Any error thrown while asking counts as
 * "not loaded".
 */
function isVLMModelLoaded(): boolean {
  let activeId: unknown;
  try {
    activeId = RunAnywhere.modelLifecycle.currentModel()?.modelId;
  } catch {
    return false;
  }
  return activeId === VLM_MODEL_ID;
}
| 323 | + |
/**
 * Records the status message in module state and, when the status element
 * is currently rendered, updates its text in place without a full re-render.
 *
 * @param text Message to show.
 */
function setStatus(text: string): void {
  status = text;

  const statusNode = container.querySelector<HTMLDivElement>('#vision-status');
  if (statusNode !== null) {
    statusNode.textContent = text;
  }
}
| 329 | + |
/**
 * Turns a caught value into display text: the `message` of SDK exceptions
 * and `Error` instances, or the stringified value for anything else.
 *
 * @param err Value caught from a `try` block.
 */
function formatErr(err: unknown): string {
  return isSDKException(err) || err instanceof Error ? err.message : String(err);
}
| 335 | + |
/**
 * Escapes text for safe interpolation into an HTML template, covering both
 * element content and quoted attribute values.
 *
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not themselves escaped a second time.
 *
 * @param value Raw text, e.g. a status message or model output.
 * @returns The text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escape(value: string): string {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
| 344 | + |
// Dispose subscription on full panel teardown (mirrors chat.ts pattern).
// The observer watches the container's parent; once the container is no
// longer connected to the document it disconnects itself and releases the
// model-state subscription. The explicit type annotation is needed because
// the callback refers back to the constant being initialized.
const disposeObserver: MutationObserver | null =
  typeof MutationObserver !== 'undefined'
    ? new MutationObserver(() => {
        if (container && !container.isConnected) {
          disposeObserver?.disconnect();
          unsubscribeState?.();
          unsubscribeState = null;
        }
      })
    : null;
if (disposeObserver && typeof document !== 'undefined') {
  const attachDisposeObserver = (): void => {
    if (container?.parentElement) {
      disposeObserver.observe(container.parentElement, { childList: true });
    }
  };
  // DOMContentLoaded fires only once. If this module is evaluated after the
  // document has finished parsing, a listener would never run, so attach
  // right away in that case.
  if (document.readyState === 'loading') {
    document.addEventListener('DOMContentLoaded', attachDisposeObserver, { once: true });
  } else {
    attachDisposeObserver();
  }
}
0 commit comments